Apply consistent code formatting across the repo. Add clang-format and pre-commit hooks.

Rob Armstrong 2025-03-27 10:30:07 -07:00
parent 2cd58fbc9a
commit ceab6e8bcc
782 changed files with 107230 additions and 106548 deletions

.clang-format (new file, 49 lines)
@@ -0,0 +1,49 @@
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: Consecutive
AlignConsecutiveDeclarations: Consecutive
AlignConsecutiveMacros: Consecutive
AlignEscapedNewlines: Left
AlignOperands: AlignAfterOperator
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
  AfterClass: true
  AfterControlStatement: false
  AfterExternBlock: true
  AfterFunction: true
  AfterStruct: true
  AfterUnion: true
  BeforeCatch: true
  BeforeElse: true
  IndentBraces: false
BreakBeforeBraces: Custom
BreakBeforeConceptDeclarations: true
BreakBeforeBinaryOperators: NonAssignment
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
ColumnLimit: 120
DerivePointerAlignment: false
FixNamespaceComments: true
IncludeCategories:
  - Regex: '^<.*>'
    Priority: 1
  - Regex: '^".*"'
    Priority: 2
SortIncludes: true
IncludeBlocks: Regroup
IndentWidth: 4
MaxEmptyLinesToKeep: 2
PointerAlignment: Right
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
Standard: c++17
TabWidth: 4
UseTab: Never
...
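As a rough illustration (not part of this commit), a small helper formatted under these options comes out roughly as in the sketch below: the opening brace on its own line after the function, four-space indentation, right-aligned pointers, and consecutive declarations and assignments padded into aligned columns. The function and variable names are invented for the example.

#include <cstring>

// Hypothetical helper, shown only to illustrate the style configured above.
int *makeZeroedBuffer(size_t count)
{
    int   *buffer = new int[count];
    size_t nbytes = count * sizeof(int);

    memset(buffer, 0, nbytes); // zero the freshly allocated block before returning it
    return buffer;
}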

.pre-commit-config.yaml (new file, 100 lines)
@@ -0,0 +1,100 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
ci:
  autofix_commit_msg: |
    [pre-commit.ci] auto code formatting
  autofix_prs: false
  autoupdate_branch: ''
  autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
  autoupdate_schedule: quarterly
  skip: []
  submodules: false
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: end-of-file-fixer
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
      - id: mixed-line-ending
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
      - id: trailing-whitespace
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.6
    hooks:
      - id: clang-format
        types_or: [file]
        files: |
          (?x)^(
            ^.*\.c$|
            ^.*\.cpp$|
            ^.*\.cu$|
            ^.*\.cuh$|
            ^.*\.cxx$|
            ^.*\.h$|
            ^.*\.hpp$|
            ^.*\.inl$|
            ^.*\.mm$
          )
        exclude: |
          (?x)^(
            Common/.*
          )
        args: ["-fallback-style=none", "-style=file", "-i"]
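Assuming the standard pre-commit workflow (the commands are not part of this commit), contributors would enable these hooks locally with "pre-commit install" and apply them across the whole tree with "pre-commit run --all-files"; a repository-wide run of the clang-format hook is the kind of sweep that would produce a 782-file reformat like this one.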


UnifiedMemoryStreams sample source, reformatted with no functional change.
@@ -31,10 +31,10 @@
The system includes are reordered so that #include <algorithm> comes before <cstdio>, <ctime>, and <vector>.
@@ -51,291 +51,287 @@
The rest of the file picks up the new style:
- template <typename T> is joined onto the declaration it introduces (template <typename T> struct Task), and the opening braces of structs and functions move to their own lines, with four-space indentation inside.
- The Task constructor initializer lists are broken before each comma, one initializer per line (: size(0) / , id(0) / , data(NULL) ...).
- Struct members such as threadData_t's tid, TaskListPtr, streams, and handles are aligned into columns per AlignConsecutiveDeclarations.
- Long checkCudaErrors(...) wrappers around cudaStreamAttachMemAsync, cudaMallocManaged, and cublasDgemv, and the multi-line printf status messages, are rejoined or re-broken to fit the 120-column limit.
- } else { becomes a closing brace with else on the following line, matching BeforeElse: true.
- std::vector<Task<double> > is tightened to std::vector<Task<double>> in the TaskList declaration and in the final swap that frees it; the task partitioning across pthreads or OpenMP threads, the kernel work, and the cleanup are otherwise unchanged.


Event-timing sample built around increment_kernel and CUDA events, reformatted with no functional change.
@@ -38,105 +38,107 @@
- The CUDA runtime includes are sorted so cuda_profiler_api.h precedes cuda_runtime.h.
- increment_kernel, correct_output, and main get their opening braces on their own lines and four-space indentation.
- correct_output now iterates with an unbraced for whose body is the braced if check, and the mismatch-reporting printf stays on one line within the 120-column limit.
- In main, the multi-line printf for the CPU iteration count is rejoined onto a single line; the pinned host allocation, async copies, increment_kernel launch, event timing, and cleanup are otherwise unchanged.


CUDA Clock sample (runtime API version), reformatted with no functional change.
@@ -48,43 +48,46 @@
- The timedReduction kernel signature is joined onto one line, with the opening brace on the next line.
- Single-statement ifs such as if (tid == 0) timer[bid] = clock(); are split so the guarded statement sits on its own indented line.
- The minimum-finding reduction loop and the final timer write keep the same logic under four-space indentation.
@@ -104,50 +107,46 @@
- main gets its opening brace on its own line.
- The checkCudaErrors(cudaMalloc(...)) and cudaMemcpy calls that previously wrapped mid-argument, and the timedReduction<<<NUM_BLOCKS, NUM_THREADS, ...>>> launch, are rejoined onto single lines within the 120-column limit.
- The averaging of per-block clock counts and the return are unchanged.


CUDA Clock sample, NVRTC host code (compileFileToCUBIN / cuLaunchKernel driver-API path), reformatted with no functional change.
@@ -34,12 +34,11 @@
- The system and helper includes are sorted alphabetically: assert.h, cuda_runtime.h, nvrtc_helper.h, stdint.h, stdio.h.
@@ -71,64 +70,68 @@
- main gets its opening brace on its own line and four-space indentation.
- The cuLaunchKernel call is re-broken with one argument per line, keeping the /* grid dim */, /* block dim */, /* shared mem, stream */, and /* arguments */ comments next to the arguments they describe.
- The wrapped cuMemcpyDtoH and cuMemFree calls are rejoined onto single lines; the timing computation and return are unchanged.


timedReduction kernel source (the extern "C" __global__ kernel compiled by the NVRTC variant), reformatted with no functional change.
@@ -37,38 +37,41 @@
- The kernel signature is joined onto one line with the opening brace on the next line.
- if (tid == 0) timer[bid] = clock();, the final result write, and the closing timer[bid + gridDim.x] = clock(); are each split so the guarded statement sits on its own indented line.
- The minimum-finding reduction loop is unchanged apart from four-space indentation.


Multi-GPU OpenMP sample built around kernelAddConstant, reformatted with no functional change.
@@ -32,128 +32,125 @@
- kernelAddConstant, correctResult, and main get their opening braces on their own lines and four-space indentation.
- Single-statement bodies such as if (data[i] != i + b) return 0;, for (unsigned int i = 0; i < n; i++) a[i] = i;, and if (a) free(a); are split so the statement sits on its own line under the condition or loop header.
- The wrapped omp_set_num_threads(num_gpus) call and its trailing comment are rejoined onto one line, as are the long checkCudaErrors(cudaSetDevice(...)), cudaMemcpy, and progress printf calls, all within the 120-column limit.
- The per-thread device selection, allocation, kernel launch, and result check inside the omp parallel region are otherwise unchanged.


fp16ScalarProduct sample, reformatted with no functional change.
@@ -25,191 +25,188 @@
- The quoted project includes "cuda_fp16.h" and "helper_cuda.h" are regrouped after the system includes <cstdio>, <cstdlib>, and <ctime>, per the new include categories.
- In reduceInShared_intrinsics and reduceInShared_native, each guarded step of the shared-memory reduction (if (threadIdx.x < 64) down through < 1, each followed by __syncthreads()) is split so the assignment sits on its own indented line.
- The scalarProductKernel_intrinsics and scalarProductKernel_native signatures are broken after __global__ void, with the full parameter list on the next line and the opening brace on its own line.
- main gets its opening brace on its own line; the wrapped cudaMallocHost, cudaMalloc, cudaMemcpy, kernel-launch, and PASSED/FAILED printf calls are rejoined to fit the 120-column limit.
- Input generation, both kernel launches, the host-side accumulation of per-block results, and cleanup are otherwise unchanged.


@ -40,314 +40,303 @@
*/ */
// System includes // System includes
#include <stdio.h>
#include <assert.h> #include <assert.h>
#include <stdio.h>
// CUDA runtime // CUDA runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h> #include <cuda_profiler_api.h>
#include <cuda_runtime.h>
// Helper functions and utilities to work with CUDA // Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h>
/** /**
* Matrix multiplication (CUDA Kernel) on the device: C = A * B * Matrix multiplication (CUDA Kernel) on the device: C = A * B
* wA is A's width and wB is B's width * wA is A's width and wB is B's width
*/ */
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx];
        }

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}
void ConstantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

/**
 * Run a simple test of matrix multiplication using CUDA
 */
int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB)
{
    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A;
    checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B;
    checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
    cudaStream_t stream;

    // Initialize host memory
    const float valB = 0.01f;
    ConstantInit(h_A, size_A, 1.0f);
    ConstantInit(h_B, size_B, valB);

    // Allocate device memory
    float *d_A, *d_B, *d_C;

    // Allocate host matrix C
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float *h_C;
    checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));

    if (h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    // copy host memory to device
    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));

    // Setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");

    // Performs warmup operation using matrixMul CUDA kernel
    if (block_size == 16) {
        MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    else {
        MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }

    printf("done\n");
    checkCudaErrors(cudaStreamSynchronize(stream));

    // Record the start event
    checkCudaErrors(cudaEventRecord(start, stream));

    // Execute the kernel
    int nIter = 300;

    for (int j = 0; j < nIter; j++) {
        if (block_size == 16) {
            MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
        else {
            MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    }

    // Record the stop event
    checkCudaErrors(cudaEventRecord(stop, stream));

    // Wait for the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));

    float msecTotal = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

    // Compute and print the performance
    float  msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul =
        2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
           " WorkgroupSize= %u threads/block\n",
           gigaFlops,
           msecPerMatrixMul,
           flopsPerMatrixMul,
           threads.x * threads.y);

    // Copy result from device to host
    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // test relative error by the formula
    //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
    double eps = 1.e-6; // machine zero

    for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
        double abs_err = fabs(h_C[i] - (dimsA.x * valB));
        double dot_length = dimsA.x;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err / abs_val / dot_length;

        if (rel_err > eps) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // Clean up memory
    checkCudaErrors(cudaFreeHost(h_A));
    checkCudaErrors(cudaFreeHost(h_B));
    checkCudaErrors(cudaFreeHost(h_C));
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    printf("\nNOTE: The CUDA Samples are not meant for performance "
           "measurements. Results may vary when GPU Boost is enabled.\n");

    if (correct) {
        return EXIT_SUCCESS;
    }
    else {
        return EXIT_FAILURE;
    }
}
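Because h_A is filled with 1.0f and h_B with valB, every element of C should equal dimsA.x * valB, which is the reference value the check above compares against. A minimal sketch of that arithmetic, assuming wA = 320 (the default dimsA.x in main below) and a hypothetical value read back from the device:

// Illustrative only: the reference value and relative-error test used by the check above.
#include <cmath>
#include <cstdio>

int main()
{
    const int    wA       = 320;        // assumed width of A (default dimsA.x)
    const float  valB     = 0.01f;      // every element of B
    const double expected = wA * valB;  // 3.2: dot product of wA ones with wA copies of 0.01
    const double gpu      = 3.1999998;  // hypothetical device result
    double       rel_err  = std::fabs(gpu - expected) / (std::fabs(gpu) * wA);
    printf("expected=%f rel_err=%e -> %s\n", expected, rel_err, rel_err < 1.e-6 ? "PASS" : "FAIL");
    return 0;
}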
/**
 * Program main
 */
int main(int argc, char **argv)
{
    printf("[Matrix Multiply Using CUDA] - Starting...\n");

    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
        printf("  Note: Outer matrix dimensions of A & B matrices"
               " must be equal.\n");

        exit(EXIT_SUCCESS);
    }

    // This will pick the best possible CUDA capable device, otherwise
    // override the device ID based on input provided at the command line
    int dev = findCudaDevice(argc, (const char **)argv);

    int block_size = 32;

    dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
    dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);

    // width of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
    }

    // height of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
    }

    // width of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
    }

    // height of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
    }

    if (dimsA.x != dimsB.y) {
        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }

    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

    checkCudaErrors(cudaProfilerStart());
    int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
    checkCudaErrors(cudaProfilerStop());

    exit(matrix_result);
}


@ -30,11 +30,11 @@
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA               // Matrix B height
#define WC WB               // Matrix C width
#define HC HA               // Matrix C height

#endif // _MATRIXMUL_H_
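For reference, with block_size = 32 (the largest size the Driver API host code will try), these macros work out to WA = 128 and HA = 192 for A, WB = 128 and HB = WA = 128 for B, and WC = 128, HC = 192 for C; halving block_size scales every dimension accordingly.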


@ -46,23 +46,23 @@
// includes, system
#include <builtin_types.h>
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project, CUDA
#include <cuda.h>
#include <helper_cuda_drvapi.h>
#include <helper_image.h>
#include <helper_string.h>
#include <helper_timer.h>
#include <iostream>
#include <string>

#include "matrixMul.h"
@ -71,11 +71,9 @@
void runTest(int argc, char **argv);
void randomInit(float *, int);
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);

static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size);

#ifndef FATBIN_FILE
#define FATBIN_FILE "matrixMul_kernel64.fatbin"
@ -84,237 +82,252 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
////////////////////////////////////////////////////////////////////////////////
// Globals
////////////////////////////////////////////////////////////////////////////////
CUdevice    cuDevice;
CUcontext   cuContext;
CUmodule    cuModule;
size_t      totalGlobalMem;

const char *sSDKsample = "matrixMulDrv (Driver API)";

void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("[ %s ]\n", sSDKsample);

    runTest(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    // initialize CUDA
    CUfunction matrixMul = NULL;
    int        block_size = 0;

    initCUDA(argc, argv, &matrixMul, &block_size);

    // set seed for rand()
    srand(2006);

    // allocate host memory for matrices A and B
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float       *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float       *h_B = reinterpret_cast<float *>(malloc(mem_size_B));

    // initialize host memory
    const float valB = 0.01f;
    constantInit(h_A, size_A, 1.0f);
    constantInit(h_B, size_B, valB);

    // allocate device memory
    CUdeviceptr d_A;
    checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
    CUdeviceptr d_B;
    checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));

    // copy host memory to device
    checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
    checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));

    // allocate device memory for result
    size_t size_C = WC * HC;
    size_t mem_size_C = sizeof(float) * size_C;

    CUdeviceptr d_C;
    checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));

    // allocate mem for the result on host side
    float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));

    // create and start timer
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);

    // start the timer
    sdkStartTimer(&timer);

    // There are two ways to launch CUDA kernels via the Driver API.
    // In this CUDA Sample, we illustrate both ways to pass parameters
    // and specify parameters. By default we use the simpler method.
    dim3 block(block_size, block_size, 1);
    dim3 grid(WC / block_size, HC / block_size, 1);

    if (1) {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (simplier method)
        size_t Matrix_Width_A = (size_t)WA;
        size_t Matrix_Width_B = (size_t)WB;
        void  *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};

        // new CUDA 4.0 Driver API Kernel launch call
        checkCudaErrors(cuLaunchKernel(matrixMul,
                                       grid.x,
                                       grid.y,
                                       grid.z,
                                       block.x,
                                       block.y,
                                       block.z,
                                       2 * block_size * block_size * sizeof(float),
                                       NULL,
                                       args,
                                       NULL));
    }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (advanced method)
        int  offset = 0;
        char argBuffer[256];

        // pass in launch parameters (not actually de-referencing CUdeviceptr).
        // CUdeviceptr is storing the value of the parameters
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
        offset += sizeof(d_C);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
        offset += sizeof(d_A);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
        offset += sizeof(d_B);

        size_t Matrix_Width_A = (size_t)WA;
        size_t Matrix_Width_B = (size_t)WB;

        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
        offset += sizeof(Matrix_Width_A);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
        offset += sizeof(Matrix_Width_B);

        void *kernel_launch_config[5] = {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};

        // new CUDA 4.0 Driver API Kernel launch call
        checkCudaErrors(cuLaunchKernel(matrixMul,
                                       grid.x,
                                       grid.y,
                                       grid.z,
                                       block.x,
                                       block.y,
                                       block.z,
                                       2 * block_size * block_size * sizeof(float),
                                       NULL,
                                       NULL,
                                       reinterpret_cast<void **>(&kernel_launch_config)));
    }

    // copy result from device to host
    checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));

    // stop and destroy timer
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    printf("Checking computed result for correctness: ");
    bool correct = true;

    for (int i = 0; i < static_cast<int>(WC * HC); i++) {
        if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * valB);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
           "Results may vary when GPU Boost is enabled.\n");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    checkCudaErrors(cuMemFree(d_A));
    checkCudaErrors(cuMemFree(d_B));
    checkCudaErrors(cuMemFree(d_C));
    checkCudaErrors(cuCtxDestroy(cuContext));
}
// Allocates a matrix with random float entries.
void randomInit(float *data, int size)
{
    for (int i = 0; i < size; ++i) {
        data[i] = rand() / static_cast<float>(RAND_MAX);
    }
}

static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size)
{
    CUfunction cuFunction = 0;
    int        major = 0, minor = 0;
    char       deviceName[100];

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

    // get compute capabilities and the devicename
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
    printf("  Total amount of global memory:     %llu bytes\n", (long long unsigned int)totalGlobalMem);

    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    std::string        module_path;
    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // select the suitable kernel function
    const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit", "matrixMul_bs8_64bit"};

    int idx = 0;
    int block_size = 32;

    while (idx < 3) {
        int threadsPerBlock = 0;
        int blocksPerGrid   = 0;

        checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
        checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
            &blocksPerGrid, &threadsPerBlock, cuFunction, 0, 2 * block_size * block_size * sizeof(float), 0));
        if (block_size * block_size <= threadsPerBlock) {
            printf("> %d block size selected\n", block_size);
            break;
        }
        else {
            block_size /= 2;
        }
        idx++;
    }

    *pMatrixMul = cuFunction;
    *blk_size   = block_size;

    return 0;
}
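As a worked example of the selection loop above (hypothetical numbers): if cuOccupancyMaxPotentialBlockSize reports a 1024-thread potential block size for matrixMul_bs32_64bit, then 32 * 32 = 1024 <= 1024 and the 32-wide kernel is kept; if it reported only 512, the loop would halve block_size to 16 and try matrixMul_bs16_64bit on the next iteration (16 * 16 = 256 <= 512), and so on down to the 8-wide variant.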


@ -42,86 +42,87 @@
//! wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
template <int block_size, typename size_type>
__device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB)
{
    // Block index
    size_type bx = blockIdx.x;
    size_type by = blockIdx.y;

    // Thread index
    size_type tx = threadIdx.x;
    size_type ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    size_type aBegin = wA * block_size * by;

    // Index of the last sub-matrix of A processed by the block
    size_type aEnd = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    size_type aStep = block_size;

    // Index of the first sub-matrix of B processed by the block
    size_type bBegin = block_size * bx;

    // Step size used to iterate through the sub-matrices of B
    size_type bStep = block_size * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[block_size][block_size];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[block_size][block_size];

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll
        for (size_type k = 0; k < block_size; ++k)
            Csub += AS(ty, k) * BS(k, tx);

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    size_type c = wB * block_size * by + block_size * bx;
    C[c + wB * ty + tx] = Csub;
}

// C wrappers around our template kernel
extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<8, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<16, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<32, size_t>(C, A, B, wA, wB);
}
#endif // #ifndef _MATRIXMUL_KERNEL_H_
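The AS()/BS() accessors used above are defined earlier in this header, outside this hunk; presumably they are plain 2-D indexing macros over the shared-memory tiles, along the lines of:

// Assumed definitions (not shown in this diff): straight indexing into the
// shared-memory tiles declared inside the loop.
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]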


@ -15,210 +15,211 @@
// With these flags defined, this source file will dynamically
// load the corresponding functions. Disabled by default.
// #define CUDA_INIT_D3D9
// #define CUDA_INIT_D3D10
// #define CUDA_INIT_D3D11
// #define CUDA_INIT_OPENGL

#include "cuda_drvapi_dynlink.h"

#include <stdio.h>

tcuInit *_cuInit;
tcuDriverGetVersion *cuDriverGetVersion;
tcuDeviceGet *cuDeviceGet;
tcuDeviceGetCount *cuDeviceGetCount;
tcuDeviceGetName *cuDeviceGetName;
tcuDeviceComputeCapability *cuDeviceComputeCapability;
tcuDeviceTotalMem *cuDeviceTotalMem;
tcuDeviceGetProperties *cuDeviceGetProperties;
tcuDeviceGetAttribute *cuDeviceGetAttribute;
tcuGetErrorString *cuGetErrorString;
tcuCtxCreate *cuCtxCreate;
tcuCtxDestroy *cuCtxDestroy;
tcuCtxAttach *cuCtxAttach;
tcuCtxDetach *cuCtxDetach;
tcuCtxPushCurrent *cuCtxPushCurrent;
tcuCtxPopCurrent *cuCtxPopCurrent;
tcuCtxGetCurrent *cuCtxGetCurrent;
tcuCtxSetCurrent *cuCtxSetCurrent;
tcuCtxGetDevice *cuCtxGetDevice;
tcuCtxSynchronize *cuCtxSynchronize;
tcuModuleLoad *cuModuleLoad;
tcuModuleLoadData *cuModuleLoadData;
tcuModuleLoadDataEx *cuModuleLoadDataEx;
tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
tcuModuleUnload *cuModuleUnload;
tcuModuleGetFunction *cuModuleGetFunction;
tcuModuleGetGlobal *cuModuleGetGlobal;
tcuModuleGetTexRef *cuModuleGetTexRef;
tcuModuleGetSurfRef *cuModuleGetSurfRef;
tcuMemGetInfo *cuMemGetInfo;
tcuMemAlloc *cuMemAlloc;
tcuMemAllocPitch *cuMemAllocPitch;
tcuMemFree *cuMemFree;
tcuMemGetAddressRange *cuMemGetAddressRange;
tcuMemAllocHost *cuMemAllocHost;
tcuMemFreeHost *cuMemFreeHost;
tcuMemHostAlloc *cuMemHostAlloc;
tcuMemHostGetFlags *cuMemHostGetFlags;
tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
tcuDeviceGetByPCIBusId *cuDeviceGetByPCIBusId;
tcuDeviceGetPCIBusId *cuDeviceGetPCIBusId;
tcuIpcGetEventHandle *cuIpcGetEventHandle;
tcuIpcOpenEventHandle *cuIpcOpenEventHandle;
tcuIpcGetMemHandle *cuIpcGetMemHandle;
tcuIpcOpenMemHandle *cuIpcOpenMemHandle;
tcuIpcCloseMemHandle *cuIpcCloseMemHandle;
tcuMemHostRegister *cuMemHostRegister;
tcuMemHostUnregister *cuMemHostUnregister;
tcuMemcpyHtoD *cuMemcpyHtoD;
tcuMemcpyDtoH *cuMemcpyDtoH;
tcuMemcpyDtoD *cuMemcpyDtoD;
tcuMemcpyDtoA *cuMemcpyDtoA;
tcuMemcpyAtoD *cuMemcpyAtoD;
tcuMemcpyHtoA *cuMemcpyHtoA;
tcuMemcpyAtoH *cuMemcpyAtoH;
tcuMemcpyAtoA *cuMemcpyAtoA;
tcuMemcpy2D *cuMemcpy2D;
tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
tcuMemcpy3D *cuMemcpy3D;
tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
tcuMemcpy2DAsync *cuMemcpy2DAsync;
tcuMemcpy3DAsync *cuMemcpy3DAsync;
tcuMemcpy *cuMemcpy;
tcuMemcpyPeer *cuMemcpyPeer;
tcuMemsetD8 *cuMemsetD8;
tcuMemsetD16 *cuMemsetD16;
tcuMemsetD32 *cuMemsetD32;
tcuMemsetD2D8 *cuMemsetD2D8;
tcuMemsetD2D16 *cuMemsetD2D16;
tcuMemsetD2D32 *cuMemsetD2D32;
tcuFuncSetBlockShape *cuFuncSetBlockShape;
tcuFuncSetSharedSize *cuFuncSetSharedSize;
tcuFuncGetAttribute *cuFuncGetAttribute;
tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
tcuLaunchKernel *cuLaunchKernel;
tcuArrayCreate *cuArrayCreate;
tcuArrayGetDescriptor *cuArrayGetDescriptor;
tcuArrayDestroy *cuArrayDestroy;
tcuArray3DCreate *cuArray3DCreate;
tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
tcuTexRefCreate *cuTexRefCreate;
tcuTexRefDestroy *cuTexRefDestroy;
tcuTexRefSetArray *cuTexRefSetArray;
tcuTexRefSetAddress *cuTexRefSetAddress;
tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
tcuTexRefSetFormat *cuTexRefSetFormat;
tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
tcuTexRefSetFlags *cuTexRefSetFlags;
tcuTexRefGetAddress *cuTexRefGetAddress;
tcuTexRefGetArray *cuTexRefGetArray;
tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
tcuTexRefGetFormat *cuTexRefGetFormat;
tcuTexRefGetFlags *cuTexRefGetFlags;
tcuSurfRefSetArray *cuSurfRefSetArray;
tcuSurfRefGetArray *cuSurfRefGetArray;
tcuParamSetSize *cuParamSetSize;
tcuParamSeti *cuParamSeti;
tcuParamSetf *cuParamSetf;
tcuParamSetv *cuParamSetv;
tcuParamSetTexRef *cuParamSetTexRef;
tcuLaunch *cuLaunch;
tcuLaunchGrid *cuLaunchGrid;
tcuLaunchGridAsync *cuLaunchGridAsync;
tcuEventCreate *cuEventCreate;
tcuEventRecord *cuEventRecord;
tcuEventQuery *cuEventQuery;
tcuEventSynchronize *cuEventSynchronize;
tcuEventDestroy *cuEventDestroy;
tcuEventElapsedTime *cuEventElapsedTime;
tcuStreamCreate *cuStreamCreate;
tcuStreamWaitEvent *cuStreamWaitEvent;
tcuStreamAddCallback *cuStreamAddCallback;
tcuStreamQuery *cuStreamQuery;
tcuStreamSynchronize *cuStreamSynchronize;
tcuStreamDestroy *cuStreamDestroy;
tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
tcuGraphicsMapResources *cuGraphicsMapResources;
tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
tcuGetExportTable *cuGetExportTable;
tcuCtxSetLimit *cuCtxSetLimit;
tcuCtxGetLimit *cuCtxGetLimit;
tcuCtxGetCacheConfig *cuCtxGetCacheConfig;
tcuCtxSetCacheConfig *cuCtxSetCacheConfig;
tcuCtxGetSharedMemConfig *cuCtxGetSharedMemConfig;
tcuCtxSetSharedMemConfig *cuCtxSetSharedMemConfig;
tcuCtxGetApiVersion *cuCtxGetApiVersion;

tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;

tcuProfilerStop *cuProfilerStop;
#ifdef CUDA_INIT_D3D9
// D3D9/CUDA interop (CUDA 1.x compatible API). These functions
// are deprecated; please use the ones below
tcuD3D9Begin *cuD3D9Begin;
tcuD3D9End *cuD3DEnd;
tcuD3D9RegisterVertexBuffer *cuD3D9RegisterVertexBuffer;
tcuD3D9MapVertexBuffer *cuD3D9MapVertexBuffer;
tcuD3D9UnmapVertexBuffer *cuD3D9UnmapVertexBuffer;
tcuD3D9UnregisterVertexBuffer *cuD3D9UnregisterVertexBuffer;

// D3D9/CUDA interop (CUDA 2.x compatible)
tcuD3D9GetDirect3DDevice *cuD3D9GetDirect3DDevice;
tcuD3D9RegisterResource *cuD3D9RegisterResource;
tcuD3D9UnregisterResource *cuD3D9UnregisterResource;
tcuD3D9MapResources *cuD3D9MapResources;
tcuD3D9UnmapResources *cuD3D9UnmapResources;
tcuD3D9ResourceSetMapFlags *cuD3D9ResourceSetMapFlags;
tcuD3D9ResourceGetSurfaceDimensions *cuD3D9ResourceGetSurfaceDimensions;
tcuD3D9ResourceGetMappedArray *cuD3D9ResourceGetMappedArray;
tcuD3D9ResourceGetMappedPointer *cuD3D9ResourceGetMappedPointer;
tcuD3D9ResourceGetMappedSize *cuD3D9ResourceGetMappedSize;
tcuD3D9ResourceGetMappedPitch *cuD3D9ResourceGetMappedPitch;

// D3D9/CUDA interop (CUDA 2.0+)
tcuD3D9GetDevice *cuD3D9GetDevice;
tcuD3D9CtxCreate *cuD3D9CtxCreate;
tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource;
#endif

#ifdef CUDA_INIT_D3D10
// D3D10/CUDA interop (CUDA 3.0+)
tcuD3D10GetDevice *cuD3D10GetDevice;
tcuD3D10CtxCreate *cuD3D10CtxCreate;
tcuGraphicsD3D10RegisterResource *cuGraphicsD3D10RegisterResource;
#endif

#ifdef CUDA_INIT_D3D11
// D3D11/CUDA interop (CUDA 3.0+)
tcuD3D11GetDevice *cuD3D11GetDevice;
tcuD3D11CtxCreate *cuD3D11CtxCreate;
tcuGraphicsD3D11RegisterResource *cuGraphicsD3D11RegisterResource;
#endif

// GL/CUDA interop
#ifdef CUDA_INIT_OPENGL
tcuGLCtxCreate *cuGLCtxCreate;
tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
tcuWGLGetDevice *cuWGLGetDevice;
#endif
#endif
@ -239,8 +240,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
{
    *pInstance = LoadLibrary(__CudaLibName);

    if (*pInstance == NULL) {
        printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
        return CUDA_ERROR_UNKNOWN;
    }
@ -248,38 +248,35 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
    return CUDA_SUCCESS;
}

#define GET_PROC_EX(name, alias, required)                                                 \
    alias = (t##name *)GetProcAddress(CudaDrvLib, #name);                                  \
    if (alias == NULL && required) {                                                       \
        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName);   \
        return CUDA_ERROR_UNKNOWN;                                                         \
    }

#define GET_PROC_EX_V2(name, alias, required)                                                            \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));                                 \
    if (alias == NULL && required) {                                                                     \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName);  \
        return CUDA_ERROR_UNKNOWN;                                                                       \
    }

#define GET_PROC_EX_V3(name, alias, required)                                                            \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));                                 \
    if (alias == NULL && required) {                                                                     \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName);  \
        return CUDA_ERROR_UNKNOWN;                                                                       \
    }
#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || defined(__MACOSX)
#include <dlfcn.h>

#if defined(__APPLE__) || defined(__MACOSX)
static char __CudaLibName[] = "/usr/local/cuda/lib/libcuda.dylib";
#elif defined(__ANDROID__)
#if defined(__aarch64__)
static char __CudaLibName[] = "/system/vendor/lib64/libcuda.so";
#elif defined(__arm__)
static char __CudaLibName[] = "/system/vendor/lib/libcuda.so";
@ -294,8 +291,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
{
    *pInstance = dlopen(__CudaLibName, RTLD_NOW);

    if (*pInstance == NULL) {
        printf("dlopen \"%s\" failed!\n", __CudaLibName);
        return CUDA_ERROR_UNKNOWN;
    }
@ -303,52 +299,49 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
    return CUDA_SUCCESS;
}

#define GET_PROC_EX(name, alias, required)                                                 \
    alias = (t##name *)dlsym(CudaDrvLib, #name);                                           \
    if (alias == NULL && required) {                                                       \
        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName);   \
        return CUDA_ERROR_UNKNOWN;                                                         \
    }

#define GET_PROC_EX_V2(name, alias, required)                                                            \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2));                                          \
    if (alias == NULL && required) {                                                                     \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName);  \
        return CUDA_ERROR_UNKNOWN;                                                                       \
    }

#define GET_PROC_EX_V3(name, alias, required)                                                            \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3));                                          \
    if (alias == NULL && required) {                                                                     \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName);  \
        return CUDA_ERROR_UNKNOWN;                                                                       \
    }

#else
#error unsupported platform
#endif
#define CHECKED_CALL(call) \ #define CHECKED_CALL(call) \
do { \ do { \
CUresult result = (call); \ CUresult result = (call); \
if (CUDA_SUCCESS != result) { \ if (CUDA_SUCCESS != result) { \
return result; \ return result; \
} \ } \
} while(0) } while (0)
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1) #define GET_PROC_REQUIRED(name) GET_PROC_EX(name, name, 1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0) #define GET_PROC_OPTIONAL(name) GET_PROC_EX(name, name, 0)
#define GET_PROC(name) GET_PROC_REQUIRED(name) #define GET_PROC(name) GET_PROC_REQUIRED(name)
#define GET_PROC_V2(name) GET_PROC_EX_V2(name,name,1) #define GET_PROC_V2(name) GET_PROC_EX_V2(name, name, 1)
#define GET_PROC_V3(name) GET_PROC_EX_V3(name,name,1) #define GET_PROC_V3(name) GET_PROC_EX_V3(name, name, 1)
CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
{ {
CUDADRIVER CudaDrvLib; CUDADRIVER CudaDrvLib;
int driverVer = 1000; int driverVer = 1000;
CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib)); CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));
@ -359,8 +352,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
// available since 2.2. if not present, version 1.0 is assumed // available since 2.2. if not present, version 1.0 is assumed
GET_PROC_OPTIONAL(cuDriverGetVersion); GET_PROC_OPTIONAL(cuDriverGetVersion);
if (cuDriverGetVersion) if (cuDriverGetVersion) {
{
CHECKED_CALL(cuDriverGetVersion(&driverVer)); CHECKED_CALL(cuDriverGetVersion(&driverVer));
} }
@ -428,24 +420,21 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuStreamDestroy); GET_PROC(cuStreamDestroy);
// These are CUDA 5.0 new functions // These are CUDA 5.0 new functions
if (driverVer >= 5000) if (driverVer >= 5000) {
{
GET_PROC(cuMipmappedArrayCreate); GET_PROC(cuMipmappedArrayCreate);
GET_PROC(cuMipmappedArrayDestroy); GET_PROC(cuMipmappedArrayDestroy);
GET_PROC(cuMipmappedArrayGetLevel); GET_PROC(cuMipmappedArrayGetLevel);
} }
// These are CUDA 4.2 new functions // These are CUDA 4.2 new functions
if (driverVer >= 4020) if (driverVer >= 4020) {
{
GET_PROC(cuFuncSetSharedMemConfig); GET_PROC(cuFuncSetSharedMemConfig);
GET_PROC(cuCtxGetSharedMemConfig); GET_PROC(cuCtxGetSharedMemConfig);
GET_PROC(cuCtxSetSharedMemConfig); GET_PROC(cuCtxSetSharedMemConfig);
} }
// These are CUDA 4.1 new functions // These are CUDA 4.1 new functions
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
{
GET_PROC(cuDeviceGetByPCIBusId); GET_PROC(cuDeviceGetByPCIBusId);
GET_PROC(cuDeviceGetPCIBusId); GET_PROC(cuDeviceGetPCIBusId);
GET_PROC(cuIpcGetEventHandle); GET_PROC(cuIpcGetEventHandle);
@ -456,8 +445,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
} }
// These could be _v2 interfaces // These could be _v2 interfaces
if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) {
{
GET_PROC_V2(cuCtxDestroy); GET_PROC_V2(cuCtxDestroy);
GET_PROC_V2(cuCtxPopCurrent); GET_PROC_V2(cuCtxPopCurrent);
GET_PROC_V2(cuCtxPushCurrent); GET_PROC_V2(cuCtxPushCurrent);
@ -465,8 +453,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC_V2(cuEventDestroy); GET_PROC_V2(cuEventDestroy);
} }
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
{
GET_PROC_V2(cuDeviceTotalMem); GET_PROC_V2(cuDeviceTotalMem);
GET_PROC_V2(cuCtxCreate); GET_PROC_V2(cuCtxCreate);
GET_PROC_V2(cuModuleGetGlobal); GET_PROC_V2(cuModuleGetGlobal);
@ -507,17 +494,14 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC_V2(cuTexRefSetAddress); GET_PROC_V2(cuTexRefSetAddress);
GET_PROC_V2(cuTexRefGetAddress); GET_PROC_V2(cuTexRefGetAddress);
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
{
GET_PROC_V3(cuTexRefSetAddress2D); GET_PROC_V3(cuTexRefSetAddress2D);
} }
else else {
{
GET_PROC_V2(cuTexRefSetAddress2D); GET_PROC_V2(cuTexRefSetAddress2D);
} }
} }
else else {
{
// versions earlier than 3020 // versions earlier than 3020
GET_PROC(cuDeviceTotalMem); GET_PROC(cuDeviceTotalMem);
GET_PROC(cuCtxCreate); GET_PROC(cuCtxCreate);
@ -562,8 +546,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
} }
// The following functions are specific to CUDA versions // The following functions are specific to CUDA versions
if (driverVer >= 4000) if (driverVer >= 4000) {
{
GET_PROC(cuCtxSetCurrent); GET_PROC(cuCtxSetCurrent);
GET_PROC(cuCtxGetCurrent); GET_PROC(cuCtxGetCurrent);
GET_PROC(cuMemHostRegister); GET_PROC(cuMemHostRegister);
@ -574,8 +557,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuProfilerStop); GET_PROC(cuProfilerStop);
} }
if (driverVer >= 3010) if (driverVer >= 3010) {
{
GET_PROC(cuModuleGetSurfRef); GET_PROC(cuModuleGetSurfRef);
GET_PROC(cuSurfRefSetArray); GET_PROC(cuSurfRefSetArray);
GET_PROC(cuSurfRefGetArray); GET_PROC(cuSurfRefGetArray);
@ -583,8 +565,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuCtxGetLimit); GET_PROC(cuCtxGetLimit);
} }
if (driverVer >= 3000) if (driverVer >= 3000) {
{
GET_PROC(cuMemcpyDtoDAsync); GET_PROC(cuMemcpyDtoDAsync);
GET_PROC(cuFuncSetCacheConfig); GET_PROC(cuFuncSetCacheConfig);
#ifdef CUDA_INIT_D3D11 #ifdef CUDA_INIT_D3D11
@ -595,12 +576,10 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuGraphicsUnregisterResource); GET_PROC(cuGraphicsUnregisterResource);
GET_PROC(cuGraphicsSubResourceGetMappedArray); GET_PROC(cuGraphicsSubResourceGetMappedArray);
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
{
GET_PROC_V2(cuGraphicsResourceGetMappedPointer); GET_PROC_V2(cuGraphicsResourceGetMappedPointer);
} }
else else {
{
GET_PROC(cuGraphicsResourceGetMappedPointer); GET_PROC(cuGraphicsResourceGetMappedPointer);
} }
@ -610,8 +589,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuGetExportTable); GET_PROC(cuGetExportTable);
} }
if (driverVer >= 2030) if (driverVer >= 2030) {
{
GET_PROC(cuMemHostGetFlags); GET_PROC(cuMemHostGetFlags);
#ifdef CUDA_INIT_D3D10 #ifdef CUDA_INIT_D3D10
GET_PROC(cuD3D10GetDevice); GET_PROC(cuD3D10GetDevice);
@ -624,17 +602,16 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
#endif #endif
} }
if (driverVer >= 2010) if (driverVer >= 2010) {
{
GET_PROC(cuModuleLoadDataEx); GET_PROC(cuModuleLoadDataEx);
GET_PROC(cuModuleLoadFatBinary); GET_PROC(cuModuleLoadFatBinary);
#ifdef CUDA_INIT_OPENGL #ifdef CUDA_INIT_OPENGL
GET_PROC(cuGLCtxCreate); GET_PROC(cuGLCtxCreate);
GET_PROC(cuGraphicsGLRegisterBuffer); GET_PROC(cuGraphicsGLRegisterBuffer);
GET_PROC(cuGraphicsGLRegisterImage); GET_PROC(cuGraphicsGLRegisterImage);
# ifdef WIN32 #ifdef WIN32
GET_PROC(cuWGLGetDevice); GET_PROC(cuWGLGetDevice);
# endif #endif
#endif #endif
#ifdef CUDA_INIT_D3D9 #ifdef CUDA_INIT_D3D9
GET_PROC(cuD3D9GetDevice); GET_PROC(cuD3D9GetDevice);
View File

@ -14,21 +14,17 @@
#ifndef HELPER_CUDA_DRVAPI_H #ifndef HELPER_CUDA_DRVAPI_H
#define HELPER_CUDA_DRVAPI_H #define HELPER_CUDA_DRVAPI_H
#include <helper_string.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <helper_string.h>
#ifndef MAX #ifndef MAX
#define MAX(a, b) (a > b ? a : b) #define MAX(a, b) (a > b ? a : b)
#endif #endif
#ifndef HELPER_CUDA_DRVAPI_H #ifndef HELPER_CUDA_DRVAPI_H
inline int ftoi(float value) { inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5) : static_cast<int>(value - 0.5)); }
return (value >= 0 ? static_cast<int>(value + 0.5)
: static_cast<int>(value - 0.5));
}
#endif #endif
#ifndef EXIT_WAIVED #ifndef EXIT_WAIVED
@ -47,311 +43,302 @@ inline int ftoi(float value) {
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__) #define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
// These are the inline versions for all of the SDK helper functions // These are the inline versions for all of the SDK helper functions
inline void __checkCudaErrors(CUresult err, const char *file, const int line) { inline void __checkCudaErrors(CUresult err, const char *file, const int line)
if (CUDA_SUCCESS != err) { {
const char *errorStr = NULL; if (CUDA_SUCCESS != err) {
cuGetErrorString(err, &errorStr); const char *errorStr = NULL;
fprintf(stderr, cuGetErrorString(err, &errorStr);
"checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, " fprintf(stderr,
"line %i.\n", "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
err, errorStr, file, line); "line %i.\n",
exit(EXIT_FAILURE); err,
} errorStr,
file,
line);
exit(EXIT_FAILURE);
}
} }
#endif #endif
// This function wraps the CUDA Driver API into a template function // This function wraps the CUDA Driver API into a template function
template <class T> template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, {
int device) { checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
} }
#endif #endif
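A minimal usage sketch of the two helpers above (illustrative; the attribute and device index are arbitrary): any Driver API return code can be routed through checkCudaErrors so a failure reports file and line, and getCudaAttribute wraps cuDeviceGetAttribute with the same checking.

    int smCount = 0;
    getCudaAttribute<int>(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0); // SM count of device 0
    checkCudaErrors(cuCtxSynchronize());                                          // exits with file/line on error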
// Beginning of GPU Architecture definitions // Beginning of GPU Architecture definitions
inline int _ConvertSMVer2CoresDRV(int major, int minor) { inline int _ConvertSMVer2CoresDRV(int major, int minor)
// Defines for GPU Architecture types (using the SM version to determine the # {
// of cores per SM // Defines for GPU Architecture types (using the SM version to determine the #
typedef struct { // of cores per SM
int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM typedef struct
// minor version {
int Cores; int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM
} sSMtoCores; // minor version
int Cores;
} sSMtoCores;
sSMtoCores nGpuArchCoresPerSM[] = { sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192},
{0x30, 192}, {0x32, 192},
{0x32, 192}, {0x35, 192},
{0x35, 192}, {0x37, 192},
{0x37, 192}, {0x50, 128},
{0x50, 128}, {0x52, 128},
{0x52, 128}, {0x53, 128},
{0x53, 128}, {0x60, 64},
{0x60, 64}, {0x61, 128},
{0x61, 128}, {0x62, 128},
{0x62, 128}, {0x70, 64},
{0x70, 64}, {0x72, 64},
{0x72, 64}, {0x75, 64},
{0x75, 64}, {0x80, 64},
{0x80, 64}, {0x86, 128},
{0x86, 128}, {0x87, 128},
{0x87, 128}, {0x90, 128},
{0x90, 128}, {-1, -1}};
{-1, -1}};
int index = 0; int index = 0;
while (nGpuArchCoresPerSM[index].SM != -1) { while (nGpuArchCoresPerSM[index].SM != -1) {
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
return nGpuArchCoresPerSM[index].Cores; return nGpuArchCoresPerSM[index].Cores;
}
index++;
} }
index++; // If we don't find the values, we default use the previous one to run
} // properly
printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
// If we don't find the values, we default use the previous one to run major,
// properly minor,
printf( nGpuArchCoresPerSM[index - 1].Cores);
"MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", return nGpuArchCoresPerSM[index - 1].Cores;
major, minor, nGpuArchCoresPerSM[index - 1].Cores);
return nGpuArchCoresPerSM[index - 1].Cores;
} }
// end of GPU Architecture definitions // end of GPU Architecture definitions
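As a quick illustration of the 0xMm encoding used by the table: a compute capability 8.6 device hashes to (8 << 4) + 6 = 0x86, which the table maps to 128 cores per SM, so

    int coresPerSM = _ConvertSMVer2CoresDRV(8, 6); // returns 128 per the table above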
#ifdef __cuda_cuda_h__ #ifdef __cuda_cuda_h__
// General GPU Device CUDA Initialization // General GPU Device CUDA Initialization
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) { inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
int cuDevice = 0; {
int deviceCount = 0; int cuDevice = 0;
checkCudaErrors(cuInit(0, __CUDA_API_VERSION)); int deviceCount = 0;
checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
checkCudaErrors(cuDeviceGetCount(&deviceCount)); checkCudaErrors(cuDeviceGetCount(&deviceCount));
if (deviceCount == 0) { if (deviceCount == 0) {
fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n"); fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
int dev = 0; int dev = 0;
dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device="); dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device=");
if (dev < 0) { if (dev < 0) {
dev = 0; dev = 0;
} }
if (dev > deviceCount - 1) { if (dev > deviceCount - 1) {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
deviceCount); fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
fprintf(stderr, fprintf(stderr, "\n");
">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", return -dev;
dev); }
fprintf(stderr, "\n");
return -dev;
}
checkCudaErrors(cuDeviceGet(&cuDevice, dev)); checkCudaErrors(cuDeviceGet(&cuDevice, dev));
char name[100]; char name[100];
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice)); checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
int computeMode; int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
if (computeMode == CU_COMPUTEMODE_PROHIBITED) { if (computeMode == CU_COMPUTEMODE_PROHIBITED) {
fprintf(stderr, fprintf(stderr,
"Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no " "Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no "
"threads can use this CUDA Device.\n"); "threads can use this CUDA Device.\n");
return -1; return -1;
} }
if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) { if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) {
printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name); printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
} }
return dev; return dev;
} }
// This function returns the best GPU based on performance // This function returns the best GPU based on performance
inline int gpuGetMaxGflopsDeviceIdDRV() { inline int gpuGetMaxGflopsDeviceIdDRV()
CUdevice current_device = 0; {
CUdevice max_perf_device = 0; CUdevice current_device = 0;
int device_count = 0; CUdevice max_perf_device = 0;
int sm_per_multiproc = 0; int device_count = 0;
unsigned long long max_compute_perf = 0; int sm_per_multiproc = 0;
int major = 0; unsigned long long max_compute_perf = 0;
int minor = 0; int major = 0;
int multiProcessorCount; int minor = 0;
int clockRate; int multiProcessorCount;
int devices_prohibited = 0; int clockRate;
int devices_prohibited = 0;
cuInit(0, __CUDA_API_VERSION); cuInit(0, __CUDA_API_VERSION);
checkCudaErrors(cuDeviceGetCount(&device_count)); checkCudaErrors(cuDeviceGetCount(&device_count));
if (device_count == 0) { if (device_count == 0) {
fprintf(stderr, fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
"gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n"); exit(EXIT_FAILURE);
exit(EXIT_FAILURE);
}
// Find the best CUDA capable GPU device
current_device = 0;
while (current_device < device_count) {
checkCudaErrors(cuDeviceGetAttribute(
&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
current_device));
checkCudaErrors(cuDeviceGetAttribute(
&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,
current_device);
if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
if (major == 9999 && minor == 9999) {
sm_per_multiproc = 1;
} else {
sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
}
unsigned long long compute_perf =
(unsigned long long)(multiProcessorCount * sm_per_multiproc *
clockRate);
if (compute_perf > max_compute_perf) {
max_compute_perf = compute_perf;
max_perf_device = current_device;
}
} else {
devices_prohibited++;
} }
++current_device; // Find the best CUDA capable GPU device
} current_device = 0;
if (devices_prohibited == device_count) { while (current_device < device_count) {
fprintf(stderr, checkCudaErrors(
"gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode " cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device));
"prohibited.\n"); checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
exit(EXIT_FAILURE); checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
} checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
return max_perf_device; int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
if (major == 9999 && minor == 9999) {
sm_per_multiproc = 1;
}
else {
sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
}
unsigned long long compute_perf = (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);
if (compute_perf > max_compute_perf) {
max_compute_perf = compute_perf;
max_perf_device = current_device;
}
}
else {
devices_prohibited++;
}
++current_device;
}
if (devices_prohibited == device_count) {
fprintf(stderr,
"gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode "
"prohibited.\n");
exit(EXIT_FAILURE);
}
return max_perf_device;
} }
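To make the selection heuristic above concrete (numbers are hypothetical): clockRate comes back from CU_DEVICE_ATTRIBUTE_CLOCK_RATE in kHz, so a device with 46 multiprocessors at compute capability 8.6 (128 cores/SM) and a 1,700,000 kHz clock scores 46 * 128 * 1,700,000 ≈ 1.0e10, while one with 30 multiprocessors, 128 cores/SM and a 1,500,000 kHz clock scores ≈ 5.8e9; the first device would be reported as the max-GFLOPS device.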
// General initialization call to pick the best CUDA Device // General initialization call to pick the best CUDA Device
inline CUdevice findCudaDeviceDRV(int argc, const char **argv) { inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
CUdevice cuDevice; {
int devID = 0; CUdevice cuDevice;
int devID = 0;
// If the command-line has a device number specified, use it // If the command-line has a device number specified, use it
if (checkCmdLineFlag(argc, (const char **)argv, "device")) { if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
devID = gpuDeviceInitDRV(argc, argv); devID = gpuDeviceInitDRV(argc, argv);
if (devID < 0) { if (devID < 0) {
printf("exiting...\n"); printf("exiting...\n");
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
}
}
else {
// Otherwise pick the device with highest Gflops/s
char name[100];
devID = gpuGetMaxGflopsDeviceIdDRV();
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
cuDeviceGetName(name, 100, cuDevice);
printf("> Using CUDA Device [%d]: %s\n", devID, name);
} }
} else {
// Otherwise pick the device with highest Gflops/s
char name[100];
devID = gpuGetMaxGflopsDeviceIdDRV();
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
cuDeviceGetName(name, 100, cuDevice);
printf("> Using CUDA Device [%d]: %s\n", devID, name);
}
cuDeviceGet(&cuDevice, devID); cuDeviceGet(&cuDevice, devID);
return cuDevice; return cuDevice;
} }
inline CUdevice findIntegratedGPUDrv() { inline CUdevice findIntegratedGPUDrv()
CUdevice current_device = 0; {
int device_count = 0; CUdevice current_device = 0;
int devices_prohibited = 0; int device_count = 0;
int isIntegrated; int devices_prohibited = 0;
int isIntegrated;
cuInit(0, __CUDA_API_VERSION); cuInit(0, __CUDA_API_VERSION);
checkCudaErrors(cuDeviceGetCount(&device_count)); checkCudaErrors(cuDeviceGetCount(&device_count));
if (device_count == 0) { if (device_count == 0) {
fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
}
// Find the integrated GPU which is compute capable
while (current_device < device_count) {
int computeMode = -1;
checkCudaErrors(cuDeviceGetAttribute(
&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
// If GPU is integrated and is not running on Compute Mode prohibited use
// that
if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
int major = 0, minor = 0;
char deviceName[256];
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
current_device));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
current_device));
checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
current_device, deviceName, major, minor);
return current_device;
} else {
devices_prohibited++;
} }
current_device++; // Find the integrated GPU which is compute capable
} while (current_device < device_count) {
int computeMode = -1;
checkCudaErrors(cuDeviceGetAttribute(&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
checkCudaErrors(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
if (devices_prohibited == device_count) { // If GPU is integrated and is not running on Compute Mode prohibited use
fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n"); // that
exit(EXIT_FAILURE); if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
} int major = 0, minor = 0;
char deviceName[256];
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device, deviceName, major, minor);
return -1; return current_device;
}
else {
devices_prohibited++;
}
current_device++;
}
if (devices_prohibited == device_count) {
fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n");
exit(EXIT_FAILURE);
}
return -1;
} }
// General check for CUDA GPU SM Capabilities // General check for CUDA GPU SM Capabilities
inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
int devID) { {
CUdevice cuDevice; CUdevice cuDevice;
char name[256]; char name[256];
int major = 0, minor = 0; int major = 0, minor = 0;
checkCudaErrors(cuDeviceGet(&cuDevice, devID)); checkCudaErrors(cuDeviceGet(&cuDevice, devID));
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice)); checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
if ((major > major_version) || if ((major > major_version) || (major == major_version && minor >= minor_version)) {
(major == major_version && minor >= minor_version)) { printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, return true;
major, minor); }
return true; else {
} else { printf("No GPU device was found that can support CUDA compute capability "
printf( "%d.%d.\n",
"No GPU device was found that can support CUDA compute capability " major_version,
"%d.%d.\n", minor_version);
major_version, minor_version); return false;
return false; }
}
} }
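For example, with a required capability of 3.5, a device reporting SM 5.0 passes the check above (major 5 > 3), a device reporting SM 3.5 passes (equal major, minor 5 >= 5), and a device reporting SM 3.2 fails (equal major, minor 2 < 5).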
#endif #endif
// end of CUDA Helper Functions // end of CUDA Helper Functions
#endif // HELPER_CUDA_DRVAPI_H
#endif // HELPER_CUDA_DRVAPI_H
View File
@ -34,8 +34,8 @@
#define WA (4 * block_size) // Matrix A width #define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height #define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width #define WB (4 * block_size) // Matrix B width
#define HB WA // Matrix B height #define HB WA // Matrix B height
#define WC WB // Matrix C width #define WC WB // Matrix C width
#define HC HA // Matrix C height #define HC HA // Matrix C height
#endif // _MATRIXMUL_H_ #endif // _MATRIXMUL_H_
View File
@ -43,10 +43,10 @@
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes, CUDA // includes, CUDA
#include "cuda_drvapi_dynlink.h" #include "cuda_drvapi_dynlink.h"
@ -60,7 +60,7 @@
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int); extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
#if defined _MSC_VER #if defined _MSC_VER
#pragma warning (disable : 4312) #pragma warning(disable : 4312)
#endif #endif
@ -68,7 +68,7 @@ extern "C" void computeGold(float *, const float *, const float *, unsigned int,
// Globals // Globals
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
CUcontext g_cuContext; CUcontext g_cuContext;
bool noprompt = false; bool noprompt = false;
static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)"; static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
@ -78,8 +78,7 @@ static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void randomInit(float *data, size_t size) void randomInit(float *data, size_t size)
{ {
for (size_t i = 0; i < size; ++i) for (size_t i = 0; i < size; ++i) {
{
data[i] = rand() / (float)RAND_MAX; data[i] = rand() / (float)RAND_MAX;
} }
} }
@ -89,33 +88,29 @@ void randomInit(float *data, size_t size)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size_out) CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size_out)
{ {
CUresult status; CUresult status;
CUdevice cuDevice; CUdevice cuDevice;
CUmodule cuModule; CUmodule cuModule;
CUfunction cuFunction; CUfunction cuFunction;
int major, minor, block_size, devID = 0; int major, minor, block_size, devID = 0;
char deviceName[256]; char deviceName[256];
// link to cuda driver dynamically // link to cuda driver dynamically
checkCudaErrors(cuInit(0, __CUDA_API_VERSION)); checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
// This assumes that the user is attempting to specify a explicit device -device=n // This assumes that the user is attempting to specify a explicit device -device=n
if (argc > 1) if (argc > 1) {
{
bool bFound = false; bool bFound = false;
for (int param=0; param < argc; param++) for (int param = 0; param < argc; param++) {
{ if (!strncmp(argv[param], "-device", 7)) {
if (!strncmp(argv[param], "-device", 7)) int i = (int)strlen(argv[1]);
{
int i=(int)strlen(argv[1]);
while (argv[1][i] != '=') while (argv[1][i] != '=') {
{
i--; i--;
} }
devID = atoi(&argv[1][++i]); devID = atoi(&argv[1][++i]);
bFound = true; bFound = true;
} }
@ -128,16 +123,15 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
int deviceCount = 0; int deviceCount = 0;
checkCudaErrors(cuDeviceGetCount(&deviceCount)); checkCudaErrors(cuDeviceGetCount(&deviceCount));
if (deviceCount == 0) if (deviceCount == 0) {
{
fprintf(stderr, "No devices supporting CUDA detected, exiting...\n"); fprintf(stderr, "No devices supporting CUDA detected, exiting...\n");
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
if (devID < 0) devID = 0; if (devID < 0)
devID = 0;
if (devID > deviceCount -1) if (devID > deviceCount - 1) {
{
fprintf(stderr, "initCUDA (Device=%d) invalid GPU device. %d GPU device(s) detected.\n\n", devID, deviceCount); fprintf(stderr, "initCUDA (Device=%d) invalid GPU device. %d GPU device(s) detected.\n\n", devID, deviceCount);
status = CUDA_ERROR_NOT_FOUND; status = CUDA_ERROR_NOT_FOUND;
@ -153,14 +147,13 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice)); checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice));
printf("> Device %d: \"%s\" with Compute %d.%d capability\n", cuDevice, deviceName, major, minor); printf("> Device %d: \"%s\" with Compute %d.%d capability\n", cuDevice, deviceName, major, minor);
block_size = 32; block_size = 32;
*block_size_out = block_size; *block_size_out = block_size;
// create context for picked device // create context for picked device
status = cuCtxCreate(&g_cuContext, 0, cuDevice); status = cuCtxCreate(&g_cuContext, 0, cuDevice);
if (CUDA_SUCCESS != status) if (CUDA_SUCCESS != status) {
{
cuCtxDestroy(g_cuContext); cuCtxDestroy(g_cuContext);
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
@ -169,53 +162,53 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
{ {
// in this branch we use compilation with parameters // in this branch we use compilation with parameters
const unsigned int jitNumOptions = 3; const unsigned int jitNumOptions = 3;
CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
void **jitOptVals = new void *[jitNumOptions]; void **jitOptVals = new void *[jitNumOptions];
// set up size of compilation log buffer // set up size of compilation log buffer
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
int jitLogBufferSize = 1024; int jitLogBufferSize = 1024;
jitOptVals[0] = (void *)(size_t)jitLogBufferSize; jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
// set up pointer to the compilation log buffer // set up pointer to the compilation log buffer
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
char *jitLogBuffer = new char[jitLogBufferSize]; char *jitLogBuffer = new char[jitLogBufferSize];
jitOptVals[1] = jitLogBuffer; jitOptVals[1] = jitLogBuffer;
// set up pointer to set the Maximum # of registers for a particular kernel // set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[2] = CU_JIT_MAX_REGISTERS; jitOptions[2] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 32; int jitRegCount = 32;
jitOptVals[2] = (void *)(size_t)jitRegCount; jitOptVals[2] = (void *)(size_t)jitRegCount;
// compile with set parameters // compile with set parameters
printf("> Compiling CUDA module\n"); printf("> Compiling CUDA module\n");
#if defined(_WIN64) || defined(__LP64__) #if defined(_WIN64) || defined(__LP64__)
status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); status =
cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
#else #else
status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); status =
cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
#endif #endif
printf("> PTX JIT log:\n%s\n", jitLogBuffer); printf("> PTX JIT log:\n%s\n", jitLogBuffer);
delete [] jitOptions; delete[] jitOptions;
delete [] jitOptVals; delete[] jitOptVals;
delete [] jitLogBuffer; delete[] jitLogBuffer;
} }
if (CUDA_SUCCESS != status) if (CUDA_SUCCESS != status) {
{
printf("Error while compiling PTX\n"); printf("Error while compiling PTX\n");
cuCtxDestroy(g_cuContext); cuCtxDestroy(g_cuContext);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
// retrieve CUDA function from the compiled module // retrieve CUDA function from the compiled module
status = cuModuleGetFunction(&cuFunction, cuModule, status = cuModuleGetFunction(
(block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit"); &cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
if (CUDA_SUCCESS != status) if (CUDA_SUCCESS != status) {
{
cuCtxDestroy(g_cuContext); cuCtxDestroy(g_cuContext);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
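Condensing the PTX JIT setup from this hunk into one place (illustrative sketch only, error handling omitted; the log-buffer size and register cap are the same values used above, and the 64-bit PTX image is picked as in the _WIN64/__LP64__ branch):

    CUjit_option opts[3]    = {CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER, CU_JIT_MAX_REGISTERS};
    char         logBuf[1024];
    void        *optVals[3] = {(void *)(size_t)sizeof(logBuf), (void *)logBuf, (void *)(size_t)32};
    CUmodule     mod;
    cuModuleLoadDataEx(&mod, matrixMul_kernel_64_ptxdump, 3, opts, optVals);
    printf("> PTX JIT log:\n%s\n", logBuf);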
@ -233,21 +226,21 @@ int main(int argc, char **argv)
printf("[ %s ]\n", sSDKsample); printf("[ %s ]\n", sSDKsample);
// initialize CUDA // initialize CUDA
CUfunction matrixMul = NULL; CUfunction matrixMul = NULL;
int block_size = 0; int block_size = 0;
checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size)); checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size));
// set seed for rand() // set seed for rand()
srand(2006); srand(2006);
// allocate host memory for matrices A and B // allocate host memory for matrices A and B
size_t size_A = WA * HA; size_t size_A = WA * HA;
size_t mem_size_A = sizeof(float) * size_A; size_t mem_size_A = sizeof(float) * size_A;
size_t size_B = WB * HB; size_t size_B = WB * HB;
size_t mem_size_B = sizeof(float) * size_B; size_t mem_size_B = sizeof(float) * size_B;
float *h_A = (float *) malloc(mem_size_A); float *h_A = (float *)malloc(mem_size_A);
float *h_B = (float *) malloc(mem_size_B); float *h_B = (float *)malloc(mem_size_B);
// initialize host memory // initialize host memory
randomInit(h_A, size_A); randomInit(h_A, size_A);
@ -264,26 +257,24 @@ int main(int argc, char **argv)
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B)); checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// allocate device memory for result // allocate device memory for result
size_t size_C = WC * HC; size_t size_C = WC * HC;
size_t mem_size_C = sizeof(float) * size_C; size_t mem_size_C = sizeof(float) * size_C;
CUdeviceptr d_C; CUdeviceptr d_C;
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C)); checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
// allocate mem for the result on host side // allocate mem for the result on host side
float *h_C = (float *) malloc(mem_size_C); float *h_C = (float *)malloc(mem_size_C);
#if __CUDA_API_VERSION >= 4000 #if __CUDA_API_VERSION >= 4000
{ {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method) // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
int Matrix_Width_A = WA; int Matrix_Width_A = WA;
int Matrix_Width_B = WB; int Matrix_Width_B = WB;
void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B }; void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
checkCudaErrors(cuLaunchKernel(matrixMul, (WC/block_size), (HC/block_size), 1, checkCudaErrors(cuLaunchKernel(
block_size , block_size , 1, matrixMul, (WC / block_size), (HC / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
0,
NULL, args, NULL));
} }
#else // __CUDA_API_VERSION <= 3020 #else // __CUDA_API_VERSION <= 3020
{ {
@ -312,7 +303,7 @@ int main(int argc, char **argv)
checkCudaErrors(cuParamSetSize(matrixMul, offset)); checkCudaErrors(cuParamSetSize(matrixMul, offset));
checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1)); checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1));
checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2*block_size*block_size*sizeof(float))); checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2 * block_size * block_size * sizeof(float)));
// set execution configuration for the CUDA kernel // set execution configuration for the CUDA kernel
checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size)); checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size));
@ -322,19 +313,18 @@ int main(int argc, char **argv)
checkCudaErrors(cuCtxSynchronize()); checkCudaErrors(cuCtxSynchronize());
// copy result from device to host // copy result from device to host
checkCudaErrors(cuMemcpyDtoH((void *) h_C, d_C, mem_size_C)); checkCudaErrors(cuMemcpyDtoH((void *)h_C, d_C, mem_size_C));
// compute reference solution // compute reference solution
float *reference = (float *) malloc(mem_size_C); float *reference = (float *)malloc(mem_size_C);
computeGold(reference, h_A, h_B, HA, WA, WB); computeGold(reference, h_A, h_B, HA, WA, WB);
// check result // check result
float diff=0.0f; float diff = 0.0f;
for (unsigned int i=0; i<size_C; i++) for (unsigned int i = 0; i < size_C; i++) {
{
float tmp = reference[i] - h_C[i]; float tmp = reference[i] - h_C[i];
diff += tmp*tmp; diff += tmp * tmp;
} }
int res = (diff / (float)size_C < 1e-6f); int res = (diff / (float)size_C < 1e-6f);
@ -349,7 +339,7 @@ int main(int argc, char **argv)
checkCudaErrors(cuMemFree(d_C)); checkCudaErrors(cuMemFree(d_C));
checkCudaErrors(cuCtxDestroy(g_cuContext)); checkCudaErrors(cuCtxDestroy(g_cuContext));
printf("Test run %s\n", (1==res) ? "success!" : "failed!"); printf("Test run %s\n", (1 == res) ? "success!" : "failed!");
exit((1 == res) ? EXIT_SUCCESS : EXIT_FAILURE); exit((1 == res) ? EXIT_SUCCESS : EXIT_FAILURE);
} }
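The pass/fail decision above is a mean-squared-error test: diff accumulates the squared difference between the reference and GPU results, and the run passes when diff / size_C < 1e-6. As a hypothetical example, if every one of the size_C elements were off by 1e-4, the mean squared error would be 1e-8, comfortably below the threshold.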
View File
@ -28,8 +28,7 @@
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// export C interface // export C interface
extern "C" extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set //! Compute reference data set
@ -40,16 +39,13 @@ void computeGold(float *, const float *, const float *, unsigned int, unsigned i
//! @param hA height of matrix A //! @param hA height of matrix A
//! @param wB width of matrix B //! @param wB width of matrix B
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void void computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{ {
for (unsigned int i = 0; i < hA; ++i) for (unsigned int i = 0; i < hA; ++i)
for (unsigned int j = 0; j < wB; ++j) for (unsigned int j = 0; j < wB; ++j) {
{
double sum = 0; double sum = 0;
for (unsigned int k = 0; k < wA; ++k) for (unsigned int k = 0; k < wA; ++k) {
{
double a = A[i * wA + k]; double a = A[i * wA + k];
double b = B[k * wB + j]; double b = B[k * wB + j];
sum += a * b; sum += a * b;
View File
@ -32,7 +32,8 @@
#define __matrixMul_kernel_32_ptxdump_h__ #define __matrixMul_kernel_32_ptxdump_h__
#if defined __cplusplus #if defined __cplusplus
extern "C" { extern "C"
{
#endif #endif
extern unsigned char matrixMul_kernel_32_ptxdump[25784]; extern unsigned char matrixMul_kernel_32_ptxdump[25784];
View File
@ -32,7 +32,8 @@
#define __matrixMul_kernel_64_ptxdump_h__ #define __matrixMul_kernel_64_ptxdump_h__
#if defined __cplusplus #if defined __cplusplus
extern "C" { extern "C"
{
#endif #endif
extern unsigned char matrixMul_kernel_64_ptxdump[26489]; extern unsigned char matrixMul_kernel_64_ptxdump[26489];
View File
@ -42,207 +42,208 @@
*/ */
// System includes // System includes
#include <stdio.h>
#include <assert.h> #include <assert.h>
#include <stdio.h>
// CUDA runtime // CUDA runtime
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include "nvrtc_helper.h" #include "nvrtc_helper.h"
// Helper functions and utilities to work with CUDA // Helper functions and utilities to work with CUDA
#include <helper_functions.h> #include <helper_functions.h>
void constantInit(float *data, int size, float val) { void constantInit(float *data, int size, float val)
for (int i = 0; i < size; ++i) { {
data[i] = val; for (int i = 0; i < size; ++i) {
} data[i] = val;
}
} }
/** /**
* Run a simple test of matrix multiplication using CUDA * Run a simple test of matrix multiplication using CUDA
*/ */
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
dim3 &dimsB) { {
// Allocate host memory for matrices A and B // Allocate host memory for matrices A and B
unsigned int size_A = dimsA.x * dimsA.y; unsigned int size_A = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A; unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A); float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = dimsB.x * dimsB.y; unsigned int size_B = dimsB.x * dimsB.y;
unsigned int mem_size_B = sizeof(float) * size_B; unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B); float *h_B = (float *)malloc(mem_size_B);
// Initialize host memory // Initialize host memory
const float valB = 0.01f; const float valB = 0.01f;
constantInit(h_A, size_A, 1.0f); constantInit(h_A, size_A, 1.0f);
constantInit(h_B, size_B, valB); constantInit(h_B, size_B, valB);
// Allocate device memory // Allocate device memory
CUdeviceptr d_A, d_B, d_C; CUdeviceptr d_A, d_B, d_C;
char *cubin, *kernel_file; char *cubin, *kernel_file;
size_t cubinSize; size_t cubinSize;
kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]); kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1); compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1);
CUmodule module = loadCUBIN(cubin, argc, argv); CUmodule module = loadCUBIN(cubin, argc, argv);
// Allocate host matrix C // Allocate host matrix C
dim3 dimsC(dimsB.x, dimsA.y, 1); dim3 dimsC(dimsB.x, dimsA.y, 1);
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
float *h_C = (float *)malloc(mem_size_C); float *h_C = (float *)malloc(mem_size_C);
if (h_C == NULL) { if (h_C == NULL) {
fprintf(stderr, "Failed to allocate host matrix C!\n"); fprintf(stderr, "Failed to allocate host matrix C!\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
}
checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
// copy host memory to device
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// Setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
// Create and start timer
printf("Computing result using CUDA Kernel...\n");
CUfunction kernel_addr;
if (block_size == 16) {
checkCudaErrors(
cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
} else {
checkCudaErrors(
cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
}
void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x,
(void *)&dimsB.x};
// Execute the kernel
int nIter = 300;
for (int j = 0; j < nIter; j++) {
checkCudaErrors(
cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */
threads.x, threads.y, threads.z, /* block dim */
0, 0, /* shared mem, stream */
&arr[0], /* arguments */
0));
checkCudaErrors(cuCtxSynchronize());
}
// Copy result from device to host
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));
printf("Checking computed result for correctness: ");
bool correct = true;
// test relative error by the formula
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
double eps = 1.e-6; // machine zero
for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
double abs_err = fabs(h_C[i] - (dimsA.x * valB));
double dot_length = dimsA.x;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
h_C[i], dimsA.x * valB, eps);
correct = false;
} }
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
printf( // copy host memory to device
"\nNOTE: The CUDA Samples are not meant for performance measurements. " checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
"Results may vary when GPU Boost is enabled.\n"); checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// Clean up memory // Setup execution parameters
free(h_A); dim3 threads(block_size, block_size);
free(h_B); dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
free(h_C);
checkCudaErrors(cuMemFree(d_A)); // Create and start timer
checkCudaErrors(cuMemFree(d_B)); printf("Computing result using CUDA Kernel...\n");
checkCudaErrors(cuMemFree(d_C));
if (correct) { CUfunction kernel_addr;
return EXIT_SUCCESS; if (block_size == 16) {
} else { checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
return EXIT_FAILURE; }
} else {
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
}
void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x};
// Execute the kernel
int nIter = 300;
for (int j = 0; j < nIter; j++) {
checkCudaErrors(cuLaunchKernel(kernel_addr,
grid.x,
grid.y,
grid.z, /* grid dim */
threads.x,
threads.y,
threads.z, /* block dim */
0,
0, /* shared mem, stream */
&arr[0], /* arguments */
0));
checkCudaErrors(cuCtxSynchronize());
}
// Copy result from device to host
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));
printf("Checking computed result for correctness: ");
bool correct = true;
// test relative error by the formula
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
double eps = 1.e-6; // machine zero
for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
double abs_err = fabs(h_C[i] - (dimsA.x * valB));
double dot_length = dimsA.x;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
// Clean up memory
free(h_A);
free(h_B);
free(h_C);
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
if (correct) {
return EXIT_SUCCESS;
}
else {
return EXIT_FAILURE;
}
} }
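To put numbers on the relative-error test above (illustrative): with the default sizes, every element of C should equal dimsA.x * valB = 320 * 0.01 = 3.2. If the GPU instead produced 3.2001, then abs_err = 1e-4 and rel_err = 1e-4 / 3.2 / 320 ≈ 9.8e-8, which is below eps = 1e-6, so that element passes.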
/** /**
* Program main * Program main
*/ */
int main(int argc, char **argv) { int main(int argc, char **argv)
printf("[Matrix Multiply Using CUDA] - Starting...\n"); {
printf("[Matrix Multiply Using CUDA] - Starting...\n");
if (checkCmdLineFlag(argc, (const char **)argv, "help") || if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
checkCmdLineFlag(argc, (const char **)argv, "?")) { printf("Usage -device=n (n >= 0 for deviceID)\n");
printf("Usage -device=n (n >= 0 for deviceID)\n"); printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
printf(
" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
int block_size = 32; int block_size = 32;
// original: // original:
dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1); dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1); dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
// reduce sizes to avoid running out of memory // reduce sizes to avoid running out of memory
// dim3 dimsA(32,32, 1); // dim3 dimsA(32,32, 1);
// dim3 dimsB(32,32,1); // dim3 dimsB(32,32,1);
// width of Matrix A // width of Matrix A
if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
} }
// height of Matrix A // height of Matrix A
if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
} }
// width of Matrix B // width of Matrix B
if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
} }
// height of Matrix B // height of Matrix B
if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
} }
if (dimsA.x != dimsB.y) { if (dimsA.x != dimsB.y) {
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
dimsA.x, dimsB.y); exit(EXIT_FAILURE);
exit(EXIT_FAILURE); }
}
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
dimsB.y);
int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB); int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
exit(matrix_result); exit(matrix_result);
} }
View File
@ -48,84 +48,83 @@
#include <cooperative_groups.h> #include <cooperative_groups.h>
template <int BLOCK_SIZE> template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
__device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) { {
// Handle to thread block group // Handle to thread block group
cooperative_groups::thread_block cta = cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
cooperative_groups::this_thread_block(); // Block index
// Block index int bx = blockIdx.x;
int bx = blockIdx.x; int by = blockIdx.y;
int by = blockIdx.y;
// Thread index // Thread index
int tx = threadIdx.x; int tx = threadIdx.x;
int ty = threadIdx.y; int ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block // Index of the first sub-matrix of A processed by the block
int aBegin = wA * BLOCK_SIZE * by; int aBegin = wA * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block // Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + wA - 1; int aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A // Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE; int aStep = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block // Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx; int bBegin = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B // Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * wB; int bStep = BLOCK_SIZE * wB;
// Csub is used to store the element of the block sub-matrix // Csub is used to store the element of the block sub-matrix
// that is computed by the thread // that is computed by the thread
float Csub = 0; float Csub = 0;
// Loop over all the sub-matrices of A and B // Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix // required to compute the block sub-matrix
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
// Declaration of the shared memory array As used to // Declaration of the shared memory array As used to
// store the sub-matrix of A // store the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to // Declaration of the shared memory array Bs used to
// store the sub-matrix of B // store the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory // Load the matrices from device memory
// to shared memory; each thread loads // to shared memory; each thread loads
// one element of each matrix // one element of each matrix
As[ty][tx] = A[a + wA * ty + tx]; As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx]; Bs[ty][tx] = B[b + wB * ty + tx];
// Synchronize to make sure the matrices are loaded // Synchronize to make sure the matrices are loaded
cooperative_groups::sync(cta); cooperative_groups::sync(cta);
// Multiply the two matrices together; // Multiply the two matrices together;
// each thread computes one element // each thread computes one element
// of the block sub-matrix // of the block sub-matrix
#pragma unroll #pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) { for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx]; Csub += As[ty][k] * Bs[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
cooperative_groups::sync(cta);
} }
// Synchronize to make sure that the preceding // Write the block sub-matrix to device memory;
// computation is done before loading two new // each thread writes one element
// sub-matrices of A and B in the next iteration int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
cooperative_groups::sync(cta); C[c + wB * ty + tx] = Csub;
}
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
} }
extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
int wA, int wB) { {
matrixMulCUDA<16>(C, A, B, wA, wB); matrixMulCUDA<16>(C, A, B, wA, wB);
} }
extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
int wA, int wB) { {
matrixMulCUDA<32>(C, A, B, wA, wB); matrixMulCUDA<32>(C, A, B, wA, wB);
} }
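A concrete walk through the tile indexing above (illustrative, using BLOCK_SIZE = 32, wA = 320 and wB = 640): for the thread block at bx = 1, by = 2, aBegin = 320 * 32 * 2 = 20480, aEnd = aBegin + 319 and aStep = 32, so the loop visits 320 / 32 = 10 tiles of A; on the B side, bBegin = 32 * 1 = 32 and bStep = 32 * 640 = 20480, advancing one tile-row of B per iteration.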
View File
@ -28,252 +28,254 @@
#include <cooperative_groups.h> #include <cooperative_groups.h>
namespace cg = cooperative_groups; namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include <assert.h> #include <assert.h>
#include <helper_cuda.h>
#include "mergeSort_common.h" #include "mergeSort_common.h"
inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB, uint arrowDir)
uint &valB, uint arrowDir) { {
uint t; uint t;
if ((keyA > keyB) == arrowDir) { if ((keyA > keyB) == arrowDir) {
t = keyA; t = keyA;
keyA = keyB; keyA = keyB;
keyB = t; keyB = t;
t = valA; t = valA;
valA = valB; valA = valB;
valB = t; valB = t;
} }
} }
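As a small illustration of the comparator above: with arrowDir = 1, a pair keyA = 7, keyB = 3 satisfies (keyA > keyB) == arrowDir, so keys and values are swapped and the pair ends up ascending (3, 7); with arrowDir = 0 the same pair is left untouched, i.e. descending.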
__global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, __global__ void
uint *d_SrcKey, uint *d_SrcVal, bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint sortDir)
uint arrayLength, uint sortDir) { {
// Handle to thread block group // Handle to thread block group
cg::thread_block cta = cg::this_thread_block(); cg::thread_block cta = cg::this_thread_block();
// Shared memory storage for one or more short vectors // Shared memory storage for one or more short vectors
__shared__ uint s_key[SHARED_SIZE_LIMIT]; __shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT]; __shared__ uint s_val[SHARED_SIZE_LIMIT];
// Offset to the beginning of subbatch and load data // Offset to the beginning of subbatch and load data
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[0]; s_key[threadIdx.x + 0] = d_SrcKey[0];
s_val[threadIdx.x + 0] = d_SrcVal[0]; s_val[threadIdx.x + 0] = d_SrcVal[0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
for (uint size = 2; size < arrayLength; size <<= 1) { for (uint size = 2; size < arrayLength; size <<= 1) {
// Bitonic merge // Bitonic merge
uint dir = (threadIdx.x & (size / 2)) != 0; uint dir = (threadIdx.x & (size / 2)) != 0;
for (uint stride = size / 2; stride > 0; stride >>= 1) { for (uint stride = size / 2; stride > 0; stride >>= 1) {
cg::sync(cta); cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir);
s_val[pos + stride], dir); }
} }
}
// ddd == sortDir for the last bitonic merge step // ddd == sortDir for the last bitonic merge step
{ {
for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) { for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
cg::sync(cta); cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], sortDir);
s_val[pos + stride], sortDir); }
} }
}
cg::sync(cta); cg::sync(cta);
d_DstKey[0] = s_key[threadIdx.x + 0]; d_DstKey[0] = s_key[threadIdx.x + 0];
d_DstVal[0] = s_val[threadIdx.x + 0]; d_DstVal[0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
} }
// Helper function (also used by odd-even merge sort) // Helper function (also used by odd-even merge sort)
extern "C" uint factorRadix2(uint *log2L, uint L) { extern "C" uint factorRadix2(uint *log2L, uint L)
if (!L) { {
*log2L = 0; if (!L) {
return 0; *log2L = 0;
} else { return 0;
for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++) }
; else {
for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
;
return L; return L;
} }
} }
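
A brief worked example of the helper's contract, since only the returned remainder is consulted by the caller below: the remainder is 1 exactly when the length is a power of two.

// Hypothetical call site, illustrative values only:
uint log2L;
assert(factorRadix2(&log2L, 1024U) == 1); // 1024 = 2^10: a supported array length
assert(factorRadix2(&log2L, 768U) == 3);  // 768 = 3 * 2^8: rejected by the caller's assert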
extern "C" void bitonicSortShared(uint *d_DstKey,
                                  uint *d_DstVal,
                                  uint *d_SrcKey,
                                  uint *d_SrcVal,
                                  uint  batchSize,
                                  uint  arrayLength,
                                  uint  sortDir)
{
    // Nothing to sort
    if (arrayLength < 2) {
        return;
    }

    // Only power-of-two array lengths are supported by this implementation
    uint log2L;
    uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
    assert(factorizationRemainder == 1);

    uint blockCount  = batchSize * arrayLength / SHARED_SIZE_LIMIT;
    uint threadCount = SHARED_SIZE_LIMIT / 2;

    assert(arrayLength <= SHARED_SIZE_LIMIT);
    assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);

    bitonicSortSharedKernel<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
    getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
}
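
As a usage sketch, a call that satisfies the asserts above could look like this; the buffer names and sizes are assumptions for illustration, with each device buffer holding batchSize * arrayLength elements.

// Hypothetical call site: arrayLength is a power of two no larger than
// SHARED_SIZE_LIMIT (1024) and batchSize * arrayLength is a multiple of it.
const uint batchSize   = 64;
const uint arrayLength = 256; // 64 * 256 = 16384 key/value pairs in total
bitonicSortShared(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, batchSize, arrayLength, 1 /* ascending */);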
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }

static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }

template <uint sortDir>
static inline __device__ void
ComparatorExtended(uint &keyA, uint &valA, uint &flagA, uint &keyB, uint &valB, uint &flagB, uint arrowDir)
{
    uint t;

    if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || ((arrowDir == sortDir) && (flagA == 1))
        || ((arrowDir != sortDir) && (flagB == 1))) {
        t     = keyA;
        keyA  = keyB;
        keyB  = t;
        t     = valA;
        valA  = valB;
        valB  = t;
        t     = flagA;
        flagA = flagB;
        flagB = t;
    }
}

template <uint sortDir>
__global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
                                                      uint *d_DstVal,
                                                      uint *d_SrcKey,
                                                      uint *d_SrcVal,
                                                      uint *d_LimitsA,
                                                      uint *d_LimitsB,
                                                      uint  stride,
                                                      uint  N)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint  s_key[2 * SAMPLE_STRIDE];
    __shared__ uint  s_val[2 * SAMPLE_STRIDE];
    __shared__ uint  s_inf[2 * SAMPLE_STRIDE];

    const uint intervalI   = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
    const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
    d_SrcKey += segmentBase;
    d_SrcVal += segmentBase;
    d_DstKey += segmentBase;
    d_DstVal += segmentBase;

    // Set up threadblock-wide parameters
    __shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst;

    if (threadIdx.x == 0) {
        uint segmentElementsA = stride;
        uint segmentElementsB = umin(stride, N - segmentBase - stride);
        uint segmentSamplesA  = stride / SAMPLE_STRIDE;
        uint segmentSamplesB  = getSampleCount(segmentElementsB);
        uint segmentSamples   = segmentSamplesA + segmentSamplesB;

        startSrcA    = d_LimitsA[blockIdx.x];
        startSrcB    = d_LimitsB[blockIdx.x];
        startDst     = startSrcA + startSrcB;
        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
        lenSrcA      = endSrcA - startSrcA;
        lenSrcB      = endSrcB - startSrcB;
    }

    s_inf[threadIdx.x + 0]             = 1;
    s_inf[threadIdx.x + SAMPLE_STRIDE] = 1;

    // Load input data
    cg::sync(cta);

    if (threadIdx.x < lenSrcA) {
        s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x];
        s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x];
        s_inf[threadIdx.x] = 0;
    }

    // Prepare for bitonic merge by inversing the ordering
    if (threadIdx.x < lenSrcB) {
        s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcKey[stride + startSrcB + threadIdx.x];
        s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcVal[stride + startSrcB + threadIdx.x];
        s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
    }

    //"Extended" bitonic merge
    for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
        cg::sync(cta);
        uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
        ComparatorExtended<sortDir>(s_key[pos + 0],
                                    s_val[pos + 0],
                                    s_inf[pos + 0],
                                    s_key[pos + stride],
                                    s_val[pos + stride],
                                    s_inf[pos + stride],
                                    sortDir);
    }

    // Store sorted data
    cg::sync(cta);
    d_DstKey += startDst;
    d_DstVal += startDst;

    if (threadIdx.x < lenSrcA) {
        d_DstKey[threadIdx.x] = s_key[threadIdx.x];
        d_DstVal[threadIdx.x] = s_val[threadIdx.x];
    }

    if (threadIdx.x < lenSrcB) {
        d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
        d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
    }
}

extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
                                                uint *d_DstVal,
                                                uint *d_SrcKey,
                                                uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB,
                                                uint  stride,
                                                uint  N,
                                                uint  sortDir)
{
    uint lastSegmentElements = N % (2 * stride);

    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;

    if (sortDir) {
        bitonicMergeElementaryIntervalsKernel<1U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
    }
    else {
        bitonicMergeElementaryIntervalsKernel<0U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
    }
}

View File

@ -26,96 +26,94 @@
 */

#include <assert.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
#include <stdio.h>
#include <stdlib.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
    uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
    StopWatchInterface *hTimer = NULL;

    const uint N         = 4 * 1048576;
    const uint DIR       = 1;
    const uint numValues = 65536;

    printf("%s Starting...\n\n", argv[0]);

    int dev = findCudaDevice(argc, (const char **)argv);

    if (dev == -1) {
        return EXIT_FAILURE;
    }

    printf("Allocating and initializing host arrays...\n\n");
    sdkCreateTimer(&hTimer);
    h_SrcKey = (uint *)malloc(N * sizeof(uint));
    h_SrcVal = (uint *)malloc(N * sizeof(uint));
    h_DstKey = (uint *)malloc(N * sizeof(uint));
    h_DstVal = (uint *)malloc(N * sizeof(uint));

    srand(2009);

    for (uint i = 0; i < N; i++) {
        h_SrcKey[i] = rand() % numValues;
    }

    fillValues(h_SrcVal, N);

    printf("Allocating and initializing CUDA arrays...\n\n");
    checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
    checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));

    printf("Initializing GPU merge sort...\n");
    initMergeSort();

    printf("Running GPU merge sort...\n");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));

    printf("Reading back GPU merge sort results...\n");
    checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));

    printf("Inspecting the results...\n");
    uint keysFlag   = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);
    uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N);

    printf("Shutting down...\n");
    closeMergeSort();
    sdkDeleteTimer(&hTimer);
    checkCudaErrors(cudaFree(d_SrcVal));
    checkCudaErrors(cudaFree(d_SrcKey));
    checkCudaErrors(cudaFree(d_BufVal));
    checkCudaErrors(cudaFree(d_BufKey));
    checkCudaErrors(cudaFree(d_DstVal));
    checkCudaErrors(cudaFree(d_DstKey));
    free(h_DstVal);
    free(h_DstKey);
    free(h_SrcVal);
    free(h_SrcKey);

    exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
}
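
If a full CPU reference is wanted in addition to the validators, the mergeSortHost() emulation declared in mergeSort_common.h could be driven the same way; the snippet below is a hypothetical extension of the driver above (it would sit inside main() before the cleanup), not part of the sample.

// Hypothetical CPU cross-check using the declared mergeSortHost() routine:
uint *h_RefKey = (uint *)malloc(N * sizeof(uint));
uint *h_RefVal = (uint *)malloc(N * sizeof(uint));
uint *h_BufKey = (uint *)malloc(N * sizeof(uint));
uint *h_BufVal = (uint *)malloc(N * sizeof(uint));
mergeSortHost(h_RefKey, h_RefVal, h_BufKey, h_BufVal, h_SrcKey, h_SrcVal, N, DIR);
// h_RefKey/h_RefVal can now be compared element by element with h_DstKey/h_DstVal.
free(h_BufVal);
free(h_BufKey);
free(h_RefVal);
free(h_RefKey);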

View File

@ -39,491 +39,499 @@
namespace cg = cooperative_groups;

#include <helper_cuda.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }

static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }

#define W (sizeof(uint) * 8)

static inline __device__ uint nextPowerOfTwo(uint x)
{
    /*
      --x;
      x |= x >> 1;
      x |= x >> 2;
      x |= x >> 4;
      x |= x >> 8;
      x |= x >> 16;
      return ++x;
    */
    return 1U << (W - __clz(x - 1));
}

template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }

    uint pos = 0;

    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }

    return pos;
}

template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }

    uint pos = 0;

    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }

    return pos;
}
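
A small worked example of the two search flavours, since the merge logic below relies on how they break ties between equal keys.

// With sortDir == 1 (ascending) and data = {1, 3, 3, 7} (L = 4, stride = 4):
//   binarySearchExclusive<1>(3, data, 4, 4) == 1   // keys strictly less than 3
//   binarySearchInclusive<1>(3, data, 4, 4) == 3   // keys less than or equal to 3
// mergeSortSharedKernel below uses Exclusive for keys of the first half and
// Inclusive for keys of the second half, so equal keys from the first half are
// placed first and the merge of the two halves stays stable.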
////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[SHARED_SIZE_LIMIT];
    __shared__ uint s_val[SHARED_SIZE_LIMIT];

    d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    s_key[threadIdx.x + 0] = d_SrcKey[0];
    s_val[threadIdx.x + 0] = d_SrcVal[0];
    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];

    for (uint stride = 1; stride < arrayLength; stride <<= 1) {
        uint lPos = threadIdx.x & (stride - 1);
        uint *baseKey = s_key + 2 * (threadIdx.x - lPos);
        uint *baseVal = s_val + 2 * (threadIdx.x - lPos);

        cg::sync(cta);
        uint keyA = baseKey[lPos + 0];
        uint valA = baseVal[lPos + 0];
        uint keyB = baseKey[lPos + stride];
        uint valB = baseVal[lPos + stride];
        uint posA = binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + lPos;
        uint posB = binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + lPos;

        cg::sync(cta);
        baseKey[posA] = keyA;
        baseVal[posA] = valA;
        baseKey[posB] = keyB;
        baseVal[posB] = valB;
    }

    cg::sync(cta);
    d_DstKey[0] = s_key[threadIdx.x + 0];
    d_DstVal[0] = s_val[threadIdx.x + 0];
    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}

static void mergeSortShared(uint *d_DstKey,
                            uint *d_DstVal,
                            uint *d_SrcKey,
                            uint *d_SrcVal,
                            uint batchSize,
                            uint arrayLength,
                            uint sortDir)
{
    if (arrayLength < 2) {
        return;
    }

    assert(SHARED_SIZE_LIMIT % arrayLength == 0);
    assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0);
    uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
    uint threadCount = SHARED_SIZE_LIMIT / 2;

    if (sortDir) {
        mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<1><<<>>> failed\n");
    }
    else {
        mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<0><<<>>> failed\n");
    }
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void
generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (pos >= threadCount) {
        return;
    }

    const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    d_SrcKey += segmentBase;
    d_RanksA += segmentBase / SAMPLE_STRIDE;
    d_RanksB += segmentBase / SAMPLE_STRIDE;

    const uint segmentElementsA = stride;
    const uint segmentElementsB = umin(stride, N - segmentBase - stride);
    const uint segmentSamplesA = getSampleCount(segmentElementsA);
    const uint segmentSamplesB = getSampleCount(segmentElementsB);

    if (i < segmentSamplesA) {
        d_RanksA[i] = i * SAMPLE_STRIDE;
        d_RanksB[i] = binarySearchExclusive<sortDir>(
            d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, nextPowerOfTwo(segmentElementsB));
    }

    if (i < segmentSamplesB) {
        d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
        d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
            d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, nextPowerOfTwo(segmentElementsA));
    }
}

static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    if (sortDir) {
        generateSampleRanksKernel<1U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
    }
    else {
        generateSampleRanksKernel<0U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
    }
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (pos >= threadCount) {
        return;
    }

    const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    d_Ranks += (pos - i) * 2;
    d_Limits += (pos - i) * 2;

    const uint segmentElementsA = stride;
    const uint segmentElementsB = umin(stride, N - segmentBase - stride);
    const uint segmentSamplesA = getSampleCount(segmentElementsA);
    const uint segmentSamplesB = getSampleCount(segmentElementsB);

    if (i < segmentSamplesA) {
        uint dstPos = binarySearchExclusive<1U>(
                          d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB))
                      + i;
        d_Limits[dstPos] = d_Ranks[i];
    }

    if (i < segmentSamplesB) {
        uint dstPos = binarySearchInclusive<1U>(
                          d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA))
                      + i;
        d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
    }
}

static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, uint *d_RanksA, uint *d_RanksB, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsA, d_RanksA, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");

    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsB, d_RanksB, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
inline __device__ void merge(uint *dstKey,
                             uint *dstVal,
                             uint *srcAKey,
                             uint *srcAVal,
                             uint *srcBKey,
                             uint *srcBVal,
                             uint lenA,
                             uint nPowTwoLenA,
                             uint lenB,
                             uint nPowTwoLenB,
                             cg::thread_block cta)
{
    uint keyA, valA, keyB, valB, dstPosA, dstPosB;

    if (threadIdx.x < lenA) {
        keyA = srcAKey[threadIdx.x];
        valA = srcAVal[threadIdx.x];
        dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + threadIdx.x;
    }

    if (threadIdx.x < lenB) {
        keyB = srcBKey[threadIdx.x];
        valB = srcBVal[threadIdx.x];
        dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + threadIdx.x;
    }

    cg::sync(cta);

    if (threadIdx.x < lenA) {
        dstKey[dstPosA] = keyA;
        dstVal[dstPosA] = valA;
    }

    if (threadIdx.x < lenB) {
        dstKey[dstPosB] = keyB;
        dstVal[dstPosB] = valB;
    }
}

template <uint sortDir>
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,
                                                uint *d_DstVal,
                                                uint *d_SrcKey,
                                                uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB,
                                                uint stride,
                                                uint N)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[2 * SAMPLE_STRIDE];
    __shared__ uint s_val[2 * SAMPLE_STRIDE];

    const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
    const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
    d_SrcKey += segmentBase;
    d_SrcVal += segmentBase;
    d_DstKey += segmentBase;
    d_DstVal += segmentBase;

    // Set up threadblock-wide parameters
    __shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;

    if (threadIdx.x == 0) {
        uint segmentElementsA = stride;
        uint segmentElementsB = umin(stride, N - segmentBase - stride);
        uint segmentSamplesA = getSampleCount(segmentElementsA);
        uint segmentSamplesB = getSampleCount(segmentElementsB);
        uint segmentSamples = segmentSamplesA + segmentSamplesB;

        startSrcA = d_LimitsA[blockIdx.x];
        startSrcB = d_LimitsB[blockIdx.x];
        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
        lenSrcA = endSrcA - startSrcA;
        lenSrcB = endSrcB - startSrcB;
        startDstA = startSrcA + startSrcB;
        startDstB = startDstA + lenSrcA;
    }

    // Load main input data
    cg::sync(cta);

    if (threadIdx.x < lenSrcA) {
        s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x];
        s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x];
    }

    if (threadIdx.x < lenSrcB) {
        s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x];
        s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x];
    }

    // Merge data in shared memory
    cg::sync(cta);
    merge<sortDir>(s_key,
                   s_val,
                   s_key + 0,
                   s_val + 0,
                   s_key + SAMPLE_STRIDE,
                   s_val + SAMPLE_STRIDE,
                   lenSrcA,
                   SAMPLE_STRIDE,
                   lenSrcB,
                   SAMPLE_STRIDE,
                   cta);

    // Store merged data
    cg::sync(cta);

    if (threadIdx.x < lenSrcA) {
        d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x];
        d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x];
    }

    if (threadIdx.x < lenSrcB) {
        d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
        d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
    }
}

static void mergeElementaryIntervals(uint *d_DstKey,
                                     uint *d_DstVal,
                                     uint *d_SrcKey,
                                     uint *d_SrcVal,
                                     uint *d_LimitsA,
                                     uint *d_LimitsB,
                                     uint stride,
                                     uint N,
                                     uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;

    if (sortDir) {
        mergeElementaryIntervalsKernel<1U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
    }
    else {
        mergeElementaryIntervalsKernel<0U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
    }
}
extern "C" void bitonicSortShared(uint *d_DstKey,
                                  uint *d_DstVal,
                                  uint *d_SrcKey,
                                  uint *d_SrcVal,
                                  uint batchSize,
                                  uint arrayLength,
                                  uint sortDir);

extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
                                                uint *d_DstVal,
                                                uint *d_SrcKey,
                                                uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB,
                                                uint stride,
                                                uint N,
                                                uint sortDir);

static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
static const uint MAX_SAMPLE_COUNT = 32768;

extern "C" void initMergeSort(void)
{
    checkCudaErrors(cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
}

extern "C" void closeMergeSort(void)
{
    checkCudaErrors(cudaFree(d_RanksA));
    checkCudaErrors(cudaFree(d_RanksB));
    checkCudaErrors(cudaFree(d_LimitsB));
    checkCudaErrors(cudaFree(d_LimitsA));
}

extern "C" void mergeSort(uint *d_DstKey,
                          uint *d_DstVal,
                          uint *d_BufKey,
                          uint *d_BufVal,
                          uint *d_SrcKey,
                          uint *d_SrcVal,
                          uint N,
                          uint sortDir)
{
    uint stageCount = 0;

    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
        ;

    uint *ikey, *ival, *okey, *oval;

    if (stageCount & 1) {
        ikey = d_BufKey;
        ival = d_BufVal;
        okey = d_DstKey;
        oval = d_DstVal;
    }
    else {
        ikey = d_DstKey;
        ival = d_DstVal;
        okey = d_BufKey;
        oval = d_BufVal;
    }

    assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
    assert(N % SHARED_SIZE_LIMIT == 0);
    mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, SHARED_SIZE_LIMIT, sortDir);

    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
        uint lastSegmentElements = N % (2 * stride);

        // Find sample ranks and prepare for limiters merge
        generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir);

        // Merge ranks and indices
        mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);

        // Merge elementary intervals
        mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, stride, N, sortDir);

        if (lastSegmentElements <= stride) {
            // Last merge segment consists of a single array which just needs to be
            // passed through
            checkCudaErrors(cudaMemcpy(okey + (N - lastSegmentElements),
                                       ikey + (N - lastSegmentElements),
                                       lastSegmentElements * sizeof(uint),
                                       cudaMemcpyDeviceToDevice));
            checkCudaErrors(cudaMemcpy(oval + (N - lastSegmentElements),
                                       ival + (N - lastSegmentElements),
                                       lastSegmentElements * sizeof(uint),
                                       cudaMemcpyDeviceToDevice));
        }

        uint *t;
        t = ikey;
        ikey = okey;
        okey = t;
        t = ival;
        ival = oval;
        oval = t;
    }
}
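
A short worked example of the stage-parity choice at the top of mergeSort(), using the sizes from the test driver.

// With N = 4 * 1048576 = 2^22 and SHARED_SIZE_LIMIT = 1024 = 2^10, the stride
// loop visits 2^10 .. 2^21, so stageCount == 12 (even). The bottom-level sort
// then writes into d_DstKey, and after twelve ping-pong swaps the final merge
// output ends up back in d_DstKey, which is what the caller reads.
uint stages = 0;
for (uint stride = 1024U; stride < 4U * 1048576U; stride <<= 1) {
    stages++;
}
assert(stages == 12);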

View File

@ -31,19 +31,17 @@
typedef unsigned int uint;

#define SHARED_SIZE_LIMIT 1024U
#define SAMPLE_STRIDE 128

////////////////////////////////////////////////////////////////////////////////
// Extensive sort validation routine
////////////////////////////////////////////////////////////////////////////////
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir);

extern "C" void fillValues(uint *val, uint N);

extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength);

////////////////////////////////////////////////////////////////////////////////
// CUDA merge sort

@ -52,13 +50,11 @@ extern "C" void initMergeSort(void);

extern "C" void closeMergeSort(void);

extern "C" void
mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

////////////////////////////////////////////////////////////////////////////////
// CPU "emulation"
////////////////////////////////////////////////////////////////////////////////
extern "C" void
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

View File

@ -29,329 +29,335 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static void checkOrder(uint *data, uint N, uint sortDir)
{
    if (N <= 1) {
        return;
    }

    for (uint i = 0; i < N - 1; i++)
        if ((sortDir && (data[i] > data[i + 1])) || (!sortDir && (data[i] < data[i + 1]))) {
            fprintf(stderr, "checkOrder() failed!!!\n");
            exit(EXIT_FAILURE);
        }
}

static uint umin(uint a, uint b) { return (a <= b) ? a : b; }

static uint getSampleCount(uint dividend)
{
    return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1) : (dividend / SAMPLE_STRIDE);
}

static uint nextPowerOfTwo(uint x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}
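
For reference, a few sample values of the bit-smearing helper above (illustrative only):

//   nextPowerOfTwo(1)    == 1
//   nextPowerOfTwo(1000) == 1024
//   nextPowerOfTwo(1024) == 1024   // exact powers of two are left unchanged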
static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }

    uint pos = 0;

    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }

    return pos;
}

static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }

    uint pos = 0;

    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }

    return pos;
}

////////////////////////////////////////////////////////////////////////////////
// Merge step 1: find sample ranks in each segment
////////////////////////////////////////////////////////////////////////////////
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    for (uint pos = 0; pos < sampleCount; pos++) {
        const uint i           = pos & ((stride / SAMPLE_STRIDE) - 1);
        const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
        const uint lenA        = stride;
        const uint lenB        = umin(stride, N - segmentBase - stride);
        const uint nA          = stride / SAMPLE_STRIDE;
        const uint nB          = getSampleCount(lenB);

        if (i < nA) {
            ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
            ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = binarySearchExclusive(
                srcKey[segmentBase + i * SAMPLE_STRIDE], srcKey + segmentBase + stride, lenB, sortDir);
        }

        if (i < nB) {
            ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
            ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = binarySearchInclusive(
                srcKey[segmentBase + stride + i * SAMPLE_STRIDE], srcKey + segmentBase, lenA, sortDir);
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
// Merge step 2: merge ranks and indices to derive elementary intervals
////////////////////////////////////////////////////////////////////////////////
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    for (uint pos = 0; pos < sampleCount; pos++) {
        const uint i           = pos & ((stride / SAMPLE_STRIDE) - 1);
        const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
        const uint lenA        = stride;
        const uint lenB        = umin(stride, N - segmentBase - stride);
        const uint nA          = stride / SAMPLE_STRIDE;
        const uint nB          = getSampleCount(lenB);

        if (i < nA) {
            uint dstPosA =
                binarySearchExclusive(
                    ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], ranks + (segmentBase + stride) / SAMPLE_STRIDE, nB, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
        }

        if (i < nB) {
            uint dstPosA =
                binarySearchInclusive(
                    ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals (each interval is <= SAMPLE_STRIDE)
////////////////////////////////////////////////////////////////////////////////
static void merge(uint *dstKey, uint *dstVal, uint *srcAKey, uint *srcAVal, static void merge(uint *dstKey,
uint *srcBKey, uint *srcBVal, uint lenA, uint lenB, uint *dstVal,
uint sortDir) { uint *srcAKey,
checkOrder(srcAKey, lenA, sortDir); uint *srcAVal,
checkOrder(srcBKey, lenB, sortDir); uint *srcBKey,
uint *srcBVal,
uint lenA,
uint lenB,
uint sortDir)
{
checkOrder(srcAKey, lenA, sortDir);
checkOrder(srcBKey, lenB, sortDir);
for (uint i = 0; i < lenA; i++) { for (uint i = 0; i < lenA; i++) {
uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i; uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i;
assert(dstPos < lenA + lenB); assert(dstPos < lenA + lenB);
dstKey[dstPos] = srcAKey[i]; dstKey[dstPos] = srcAKey[i];
dstVal[dstPos] = srcAVal[i]; dstVal[dstPos] = srcAVal[i];
} }
for (uint i = 0; i < lenB; i++) { for (uint i = 0; i < lenB; i++) {
uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i; uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i;
assert(dstPos < lenA + lenB); assert(dstPos < lenA + lenB);
dstKey[dstPos] = srcBKey[i]; dstKey[dstPos] = srcBKey[i];
dstVal[dstPos] = srcBVal[i]; dstVal[dstPos] = srcBVal[i];
} }
} }
static void mergeElementaryIntervals(uint *dstKey, uint *dstVal, uint *srcKey, static void mergeElementaryIntervals(uint *dstKey,
uint *srcVal, uint *limitsA, uint *limitsB, uint *dstVal,
uint stride, uint N, uint sortDir) { uint *srcKey,
uint lastSegmentElements = N % (2 * stride); uint *srcVal,
uint mergePairs = (lastSegmentElements > stride) uint *limitsA,
? getSampleCount(N) uint *limitsB,
: (N - lastSegmentElements) / SAMPLE_STRIDE; uint stride,
uint N,
uint sortDir)
{
uint lastSegmentElements = N % (2 * stride);
uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
for (uint pos = 0; pos < mergePairs; pos++) { for (uint pos = 0; pos < mergePairs; pos++) {
uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1); uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
uint segmentBase = (pos - i) * SAMPLE_STRIDE; uint segmentBase = (pos - i) * SAMPLE_STRIDE;
const uint lenA = stride; const uint lenA = stride;
const uint lenB = umin(stride, N - segmentBase - stride); const uint lenB = umin(stride, N - segmentBase - stride);
const uint nA = stride / SAMPLE_STRIDE; const uint nA = stride / SAMPLE_STRIDE;
const uint nB = getSampleCount(lenB); const uint nB = getSampleCount(lenB);
const uint n = nA + nB; const uint n = nA + nB;
const uint startPosA = limitsA[pos]; const uint startPosA = limitsA[pos];
const uint endPosA = (i + 1 < n) ? limitsA[pos + 1] : lenA; const uint endPosA = (i + 1 < n) ? limitsA[pos + 1] : lenA;
const uint startPosB = limitsB[pos]; const uint startPosB = limitsB[pos];
const uint endPosB = (i + 1 < n) ? limitsB[pos + 1] : lenB; const uint endPosB = (i + 1 < n) ? limitsB[pos + 1] : lenB;
const uint startPosDst = startPosA + startPosB; const uint startPosDst = startPosA + startPosB;
assert(startPosA <= endPosA && endPosA <= lenA); assert(startPosA <= endPosA && endPosA <= lenA);
assert(startPosB <= endPosB && endPosB <= lenB); assert(startPosB <= endPosB && endPosB <= lenB);
assert((endPosA - startPosA) <= SAMPLE_STRIDE); assert((endPosA - startPosA) <= SAMPLE_STRIDE);
assert((endPosB - startPosB) <= SAMPLE_STRIDE); assert((endPosB - startPosB) <= SAMPLE_STRIDE);
merge(dstKey + segmentBase + startPosDst, merge(dstKey + segmentBase + startPosDst,
dstVal + segmentBase + startPosDst, dstVal + segmentBase + startPosDst,
(srcKey + segmentBase + 0) + startPosA, (srcKey + segmentBase + 0) + startPosA,
(srcVal + segmentBase + 0) + startPosA, (srcVal + segmentBase + 0) + startPosA,
(srcKey + segmentBase + stride) + startPosB, (srcKey + segmentBase + stride) + startPosB,
(srcVal + segmentBase + stride) + startPosB, endPosA - startPosA, (srcVal + segmentBase + stride) + startPosB,
endPosB - startPosB, sortDir); endPosA - startPosA,
} endPosB - startPosB,
sortDir);
}
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Retarded bubble sort // Retarded bubble sort
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir) { static void bubbleSort(uint *key, uint *val, uint N, uint sortDir)
if (N <= 1) { {
return; if (N <= 1) {
} return;
}
for (uint bottom = 0; bottom < N - 1; bottom++) {
uint savePos = bottom; for (uint bottom = 0; bottom < N - 1; bottom++) {
uint saveKey = key[bottom]; uint savePos = bottom;
uint saveKey = key[bottom];
for (uint i = bottom + 1; i < N; i++)
if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) { for (uint i = bottom + 1; i < N; i++)
savePos = i; if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) {
saveKey = key[i]; savePos = i;
} saveKey = key[i];
}
if (savePos != bottom) {
uint t; if (savePos != bottom) {
t = key[savePos]; uint t;
key[savePos] = key[bottom]; t = key[savePos];
key[bottom] = t; key[savePos] = key[bottom];
t = val[savePos]; key[bottom] = t;
val[savePos] = val[bottom]; t = val[savePos];
val[bottom] = t; val[savePos] = val[bottom];
val[bottom] = t;
}
} }
}
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Interface function // Interface function
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, extern "C" void
uint *bufVal, uint *srcKey, uint *srcVal, uint N, mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir)
uint sortDir) { {
uint *ikey, *ival, *okey, *oval; uint *ikey, *ival, *okey, *oval;
uint stageCount = 0; uint stageCount = 0;
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++) for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
; ;
if (stageCount & 1) { if (stageCount & 1) {
ikey = bufKey; ikey = bufKey;
ival = bufVal; ival = bufVal;
okey = dstKey; okey = dstKey;
oval = dstVal; oval = dstVal;
} else { }
ikey = dstKey; else {
ival = dstVal; ikey = dstKey;
okey = bufKey; ival = dstVal;
oval = bufVal; okey = bufKey;
} oval = bufVal;
printf("Bottom-level sort...\n");
memcpy(ikey, srcKey, N * sizeof(uint));
memcpy(ival, srcVal, N * sizeof(uint));
for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos),
sortDir);
}
printf("Merge...\n");
uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
uint lastSegmentElements = N % (2 * stride);
// Find sample ranks and prepare for limiters merge
generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);
// Merge ranks and indices
mergeRanksAndIndices(limitsA, ranksA, stride, N);
mergeRanksAndIndices(limitsB, ranksB, stride, N);
// Merge elementary intervals
mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride,
N, sortDir);
if (lastSegmentElements <= stride) {
// Last merge segment consists of a single array which just needs to be
// passed through
memcpy(okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint));
memcpy(oval + (N - lastSegmentElements), ival + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint));
} }
uint *t; printf("Bottom-level sort...\n");
t = ikey; memcpy(ikey, srcKey, N * sizeof(uint));
ikey = okey; memcpy(ival, srcVal, N * sizeof(uint));
okey = t;
t = ival;
ival = oval;
oval = t;
}
free(limitsB); for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
free(limitsA); bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos), sortDir);
free(ranksB); }
free(ranksA);
printf("Merge...\n");
uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
uint lastSegmentElements = N % (2 * stride);
// Find sample ranks and prepare for limiters merge
generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);
// Merge ranks and indices
mergeRanksAndIndices(limitsA, ranksA, stride, N);
mergeRanksAndIndices(limitsB, ranksB, stride, N);
// Merge elementary intervals
mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, N, sortDir);
if (lastSegmentElements <= stride) {
// Last merge segment consists of a single array which just needs to be
// passed through
memcpy(
okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
memcpy(
oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
}
uint *t;
t = ikey;
ikey = okey;
okey = t;
t = ival;
ival = oval;
oval = t;
}
free(limitsB);
free(limitsA);
free(ranksB);
free(ranksA);
} }
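The exclusive/inclusive asymmetry in the two binary searches above is what makes the merge stable and collision-free. A minimal host-side sketch of the same rank idea (illustrative only, reusing binarySearchExclusive/binarySearchInclusive and the uint typedef from mergeSort_common.h; dst is assumed to hold lenA + lenB elements):

// A[i] lands at i + (#B strictly before it); B[i] lands at i + (#A before or equal).
// Ties are therefore won by A, every destination slot is hit exactly once, and the
// relative order of equal keys is preserved.
static void rankMergeSketch(uint *a, uint lenA, uint *b, uint lenB, uint *dst, uint sortDir)
{
    for (uint i = 0; i < lenA; i++)
        dst[binarySearchExclusive(a[i], b, lenB, sortDir) + i] = a[i];

    for (uint i = 0; i < lenB; i++)
        dst[binarySearchInclusive(b[i], a, lenA, sortDir) + i] = b[i];
}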


@ -29,104 +29,100 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Validate sorted keys array (check for integrity and proper order)
////////////////////////////////////////////////////////////////////////////////
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir)
{
    uint *srcHist;
    uint *resHist;

    if (arrayLength < 2) {
        printf("validateSortedKeys(): arrays too short, exiting...\n");
        return 1;
    }

    printf("...inspecting keys array: ");
    srcHist = (uint *)malloc(numValues * sizeof(uint));
    resHist = (uint *)malloc(numValues * sizeof(uint));

    int flag = 1;

    for (uint j = 0; j < batchSize; j++, srcKey += arrayLength, resKey += arrayLength) {
        // Build histograms for keys arrays
        memset(srcHist, 0, numValues * sizeof(uint));
        memset(resHist, 0, numValues * sizeof(uint));

        for (uint i = 0; i < arrayLength; i++) {
            if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
                srcHist[srcKey[i]]++;
                resHist[resKey[i]]++;
            }
            else {
                fprintf(stderr, "***Set %u source/result key arrays are not limited properly***\n", j);
                flag = 0;
                goto brk;
            }
        }

        // Compare the histograms
        for (uint i = 0; i < numValues; i++)
            if (srcHist[i] != resHist[i]) {
                fprintf(stderr, "***Set %u source/result keys histograms do not match***\n", j);
                flag = 0;
                goto brk;
            }

        // Finally check the ordering
        for (uint i = 0; i < arrayLength - 1; i++)
            if ((sortDir && (resKey[i] > resKey[i + 1])) || (!sortDir && (resKey[i] < resKey[i + 1]))) {
                fprintf(stderr, "***Set %u result key array is not ordered properly***\n", j);
                flag = 0;
                goto brk;
            }
    }

brk:
    free(resHist);
    free(srcHist);

    if (flag)
        printf("OK\n");

    return flag;
}

////////////////////////////////////////////////////////////////////////////////
// Value validation / stability check routines
////////////////////////////////////////////////////////////////////////////////
extern "C" void fillValues(uint *val, uint N)
{
    for (uint i = 0; i < N; i++)
        val[i] = i;
}

extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength)
{
    int correctFlag = 1, stableFlag = 1;

    printf("...inspecting keys and values array: ");

    for (uint i = 0; i < batchSize; i++, resKey += arrayLength, resVal += arrayLength) {
        for (uint j = 0; j < arrayLength; j++) {
            if (resKey[j] != srcKey[resVal[j]])
                correctFlag = 0;

            if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && (resVal[j] > resVal[j + 1]))
                stableFlag = 0;
        }
    }

    printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
    printf(stableFlag ? "...stability property: stable!\n" : "...stability property: NOT stable\n");

    return correctFlag;
}
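The histogram pass above is simply a permutation check: the sorted output must contain exactly the same multiset of keys as the input. A self-contained sketch of that test (hypothetical helper, not part of the sample; assumes every key is below numValues):

#include <vector>

static bool isPermutationSketch(const unsigned int *src, const unsigned int *res, unsigned int n, unsigned int numValues)
{
    std::vector<unsigned int> srcHist(numValues, 0), resHist(numValues, 0);

    for (unsigned int i = 0; i < n; i++) {
        if (src[i] >= numValues || res[i] >= numValues)
            return false; // out-of-range key: arrays are not limited properly
        srcHist[src[i]]++;
        resHist[res[i]]++;
    }

    return srcHist == resHist; // equal histograms <=> res is a permutation of src
}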


@ -29,106 +29,105 @@
#include <stdio.h>

// Includes CUDA
#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

namespace cg = cooperative_groups;

#if __CUDA_ARCH__ >= 700
template <bool writeSquareRoot>
__device__ void reduceBlockData(cuda::barrier<cuda::thread_scope_block> &barrier,
                                cg::thread_block_tile<32> &tile32,
                                double &threadSum,
                                double *result)
{
    extern __shared__ double tmp[];

#pragma unroll
    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
        threadSum += tile32.shfl_down(threadSum, offset);
    }
    if (tile32.thread_rank() == 0) {
        tmp[tile32.meta_group_rank()] = threadSum;
    }

    auto token = barrier.arrive();

    barrier.wait(std::move(token));

    // The warp 0 will perform last round of reduction
    if (tile32.meta_group_rank() == 0) {
        double beta = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0;

#pragma unroll
        for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
            beta += tile32.shfl_down(beta, offset);
        }

        if (tile32.thread_rank() == 0) {
            if (writeSquareRoot)
                *result = sqrt(beta);
            else
                *result = beta;
        }
    }
}
#endif

__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size)
{
#if __CUDA_ARCH__ >= 700
#pragma diag_suppress static_var_with_dynamic_init
    cg::thread_block cta = cg::this_thread_block();
    cg::grid_group grid = cg::this_grid();
    ;
    cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

    __shared__ cuda::barrier<cuda::thread_scope_block> barrier;

    if (threadIdx.x == 0) {
        init(&barrier, blockDim.x);
    }

    cg::sync(cta);

    double threadSum = 0.0;

    for (int i = grid.thread_rank(); i < size; i += grid.size()) {
        threadSum += (double)(vecA[i] * vecB[i]);
    }

    // Each thread block performs reduction of partial dotProducts and writes to
    // global mem.
    reduceBlockData<false>(barrier, tile32, threadSum, &partialResults[blockIdx.x]);

    cg::sync(grid);

    // One block performs the final summation of partial dot products
    // of all the thread blocks and writes the sqrt of final dot product.
    if (blockIdx.x == 0) {
        threadSum = 0.0;
        for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) {
            threadSum += partialResults[i];
        }
        reduceBlockData<true>(barrier, tile32, threadSum, &partialResults[0]);
    }

    cg::sync(grid);

    const double finalValue = partialResults[0];

    // Perform normalization of vecA & vecB.
    for (int i = grid.thread_rank(); i < size; i += grid.size()) {
        vecA[i] = (float)vecA[i] / finalValue;
        vecB[i] = (float)vecB[i] / finalValue;
    }

#endif
}

@ -137,119 +136,113 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", argv[0]);

    // This will pick the best possible CUDA capable device
    int dev = findCudaDevice(argc, (const char **)argv);

    int major = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));

    // Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher.
    if (major < 7) {
        printf("simpleAWBarrier requires SM 7.0 or higher. Exiting...\n");
        exit(EXIT_WAIVED);
    }

    int supportsCooperativeLaunch = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev));

    if (!supportsCooperativeLaunch) {
        printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
               "Waiving the run\n",
               dev);
        exit(EXIT_WAIVED);
    }

    int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev);

    printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!");

    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
{
    float *vecA, *d_vecA;
    float *vecB, *d_vecB;
    double *d_partialResults;
    int size = 10000000;

    checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size));
    checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size));

    checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size));
    checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size));

    float baseVal = 2.0;

    for (int i = 0; i < size; i++) {
        vecA[i] = vecB[i] = baseVal;
    }

    cudaStream_t stream;
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, cudaMemcpyHostToDevice, stream));

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.
    int minGridSize = 0, blockSize = 0;
    checkCudaErrors(
        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));

    int smemSize = ((blockSize / 32) + 1) * sizeof(double);

    int numBlocksPerSm = 0;
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));

    int multiProcessorCount = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));

    minGridSize = multiProcessorCount * numBlocksPerSm;
    checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));

    printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
           "blockSize = %d\n",
           minGridSize,
           blockSize);

    dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);

    void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, (void *)&d_partialResults, (void *)&size};

    checkCudaErrors(cudaLaunchCooperativeKernel(
        (void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream));

    checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    float expectedResult = (baseVal / sqrt(size * baseVal * baseVal));

    unsigned int matches = 0;

    for (int i = 0; i < size; i++) {
        if ((vecA[i] - expectedResult) > 0.00001) {
            printf("mismatch at i = %d\n", i);
            break;
        }
        else {
            matches++;
        }
    }

    printf("Result = %s\n", matches == size ? "PASSED" : "FAILED");
    checkCudaErrors(cudaFree(d_vecA));
    checkCudaErrors(cudaFree(d_vecB));
    checkCudaErrors(cudaFree(d_partialResults));

    checkCudaErrors(cudaFreeHost(vecA));
    checkCudaErrors(cudaFreeHost(vecB));
    return matches == size;
}
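For reference, a stripped-down sketch of the arrive/wait pattern the kernel above relies on (hypothetical kernel, assuming a block of at most 128 threads on SM 7.0+): each thread publishes a value to shared memory, arrives, and only reads other threads' values after the wait completes.

#include <cuda/barrier>

__global__ void arriveWaitSketch(const float *in, float *out)
{
    __shared__ float tmp[128];
    __shared__ cuda::barrier<cuda::thread_scope_block> bar;

    if (threadIdx.x == 0) {
        init(&bar, blockDim.x); // one expected arrival per thread in the block
    }
    __syncthreads();

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    tmp[threadIdx.x] = in[idx];

    auto token = bar.arrive();  // signal "my element is published"
    bar.wait(std::move(token)); // returns once every thread has arrived

    // Any neighbour's element can be read safely now.
    out[idx] = tmp[(threadIdx.x + 1) % blockDim.x];
}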


@ -34,17 +34,17 @@
#endif

// Includes, system
#include <cassert>
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

const char *sampleName = "simpleAssert";

@ -58,9 +58,10 @@ bool testResult = true;
//! Tests assert function.
//! Thread whose id > N will print assertion failed error message.
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int N)
{
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    assert(gtid < N);
}

////////////////////////////////////////////////////////////////////////////////
@ -70,59 +71,60 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void runTest(int argc, char **argv)
{
    int Nblocks = 2;
    int Nthreads = 32;
    cudaError_t error;

#ifndef _WIN32
    utsname OS_System_Type;
    uname(&OS_System_Type);

    printf("OS_System_Type.release = %s\n", OS_System_Type.release);

    if (!strcasecmp(OS_System_Type.sysname, "Darwin")) {
        printf("simpleAssert is not current supported on Mac OSX\n\n");
        exit(EXIT_SUCCESS);
    }
    else {
        printf("OS Info: <%s>\n\n", OS_System_Type.version);
    }

#endif

    // This will pick the best possible CUDA capable device
    findCudaDevice(argc, (const char **)argv);

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.
    dim3 dimGrid(Nblocks);
    dim3 dimBlock(Nthreads);

    printf("Launch kernel to generate assertion failures\n");
    testKernel<<<dimGrid, dimBlock>>>(60);

    // Synchronize (flushes assert output).
    printf("\n-- Begin assert output\n\n");
    error = cudaDeviceSynchronize();
    printf("\n-- End assert output\n\n");

    // Check for errors and failed asserts in asynchronous kernel launch.
    if (error == cudaErrorAssert) {
        printf("Device assert failed as expected, "
               "CUDA error message is: %s\n\n",
               cudaGetErrorString(error));
    }

    testResult = error == cudaErrorAssert;
}
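The pattern above generalizes: a device-side assert surfaces as cudaErrorAssert on the next synchronizing call, and the context cannot be used afterwards. A minimal sketch with a hypothetical kernel and launch shape:

#include <cassert>

__global__ void checkPositive(const float *v, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        assert(v[i] > 0.0f); // fires for any non-positive element
}

// Host side (sketch):
//   checkPositive<<<blocks, threads>>>(d_v, n);
//   cudaError_t err = cudaDeviceSynchronize(); // flushes the assert output
//   if (err == cudaErrorAssert) { /* an assert fired; further CUDA calls on this context will fail */ }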


@ -34,15 +34,16 @@
#endif

// Includes, system
#include <cassert>
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>

#include "nvrtc_helper.h"

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

const char *sampleName = "simpleAssert_nvrtc";

@ -58,56 +59,63 @@ void runTest(int argc, char **argv);
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void runTest(int argc, char **argv)
{
    int Nblocks = 2;
    int Nthreads = 32;

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.
    dim3 dimGrid(Nblocks);
    dim3 dimBlock(Nthreads);

    printf("Launch kernel to generate assertion failures\n");

    char *cubin, *kernel_file;
    size_t cubinSize;

    kernel_file = sdkFindFilePath("simpleAssert_kernel.cu", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

    CUmodule module = loadCUBIN(cubin, argc, argv);

    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel"));

    int count = 60;
    void *args[] = {(void *)&count};

    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   dimGrid.x,
                                   dimGrid.y,
                                   dimGrid.z, /* grid dim */
                                   dimBlock.x,
                                   dimBlock.y,
                                   dimBlock.z, /* block dim */
                                   0,
                                   0, /* shared mem, stream */
                                   &args[0], /* arguments */
                                   0));

    // Synchronize (flushes assert output).
    printf("\n-- Begin assert output\n\n");
    CUresult res = cuCtxSynchronize();
    printf("\n-- End assert output\n\n");

    // Check for errors and failed asserts in asynchronous kernel launch.
    if (res == CUDA_ERROR_ASSERT) {
        printf("Device assert failed as expected\n");
    }

    testResult = res == CUDA_ERROR_ASSERT;
}
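compileFileToCUBIN() and loadCUBIN() are helper wrappers from nvrtc_helper.h. For orientation only, a rough sketch of the underlying NVRTC calls when compiling a source string to PTX directly (error checking omitted; the architecture flag is an assumption):

#include <nvrtc.h>
#include <vector>

std::vector<char> compilePtxSketch(const char *source)
{
    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, source, "assert_kernel.cu", 0, nullptr, nullptr);

    const char *opts[] = {"--gpu-architecture=compute_70"};
    nvrtcCompileProgram(prog, 1, opts); // inspect nvrtcResult and the program log in real code

    size_t ptxSize = 0;
    nvrtcGetPTXSize(prog, &ptxSize);
    std::vector<char> ptx(ptxSize);
    nvrtcGetPTX(prog, ptx.data());

    nvrtcDestroyProgram(&prog);
    return ptx; // load with cuModuleLoadData() and fetch the kernel via cuModuleGetFunction()
}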


@ -32,7 +32,8 @@
//! Thread whose id > N will print assertion failed error message.
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void testKernel(int N)
{
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    assert(gtid < N);
}


@ -30,10 +30,10 @@
*/

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@ -45,10 +45,10 @@
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

// Includes, kernels
#include "simpleAtomicIntrinsics_kernel.cuh"

@ -68,67 +68,67 @@ extern "C" bool computeGold(int *gpuData, const int len);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    cudaStream_t stream;
    // This will pick the best possible CUDA capable device
    findCudaDevice(argc, (const char **)argv);

    StopWatchInterface *timer;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    unsigned int numThreads = 256;
    unsigned int numBlocks = 64;
    unsigned int numData = 11;
    unsigned int memSize = sizeof(int) * numData;

    // allocate mem for the result on host side
    int *hOData;
    checkCudaErrors(cudaMallocHost(&hOData, memSize));

    // initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    // allocate device memory for result
    int *dOData;
    checkCudaErrors(cudaMalloc((void **)&dOData, memSize));
    // copy host memory to device to initialize to zero
    checkCudaErrors(cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream));

    // execute the kernel
    testKernel<<<numBlocks, numThreads, 0, stream>>>(dOData);

    // Copy result from device to host
    checkCudaErrors(cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    checkCudaErrors(cudaFreeHost(hOData));
    checkCudaErrors(cudaFree(dOData));
}
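hOData is allocated with cudaMallocHost rather than malloc because cudaMemcpyAsync only behaves asynchronously with page-locked host memory. A minimal sketch of that pinned-memory + stream pattern (N is an assumed element count):

float *hBuf, *dBuf;
cudaStream_t s;
const size_t bytes = N * sizeof(float);

checkCudaErrors(cudaMallocHost(&hBuf, bytes)); // pinned (page-locked) host memory
checkCudaErrors(cudaMalloc(&dBuf, bytes));
checkCudaErrors(cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking));

checkCudaErrors(cudaMemcpyAsync(dBuf, hBuf, bytes, cudaMemcpyHostToDevice, s));
// ... launch kernels on stream s here ...
checkCudaErrors(cudaMemcpyAsync(hBuf, dBuf, bytes, cudaMemcpyDeviceToHost, s));
checkCudaErrors(cudaStreamSynchronize(s)); // host-side results are valid only after this returns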


@ -42,141 +42,142 @@ extern "C" int computeGold(int *gpuData, const int len);
//! @param idata input data as provided to device
//! @param len number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
int computeGold(int *gpuData, const int len)
{
    int val = 0;

    for (int i = 0; i < len; ++i) {
        val += 10;
    }

    if (val != gpuData[0]) {
        printf("atomicAdd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        val -= 10;
    }

    if (val != gpuData[1]) {
        printf("atomicSub failed\n");
        return false;
    }

    bool found = false;

    for (int i = 0; i < len; ++i) {
        // third element should be a member of [0, len)
        if (i == gpuData[2]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for (int i = 0; i < len; ++i) {
        // fourth element should be len-1
        val = max(val, i);
    }

    if (val != gpuData[3]) {
        printf("atomicMax failed\n");
        return false;
    }

    val = 1 << 8;

    for (int i = 0; i < len; ++i) {
        val = min(val, i);
    }

    if (val != gpuData[4]) {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val = 0;

    for (int i = 0; i < len; ++i) {
        val = (val >= limit) ? 0 : val + 1;
    }

    if (val != gpuData[5]) {
        printf("atomicInc failed\n");
        return false;
    }

    limit = 137;
    val = 0;

    for (int i = 0; i < len; ++i) {
        val = ((val == 0) || (val > limit)) ? limit : val - 1;
    }

    if (val != gpuData[6]) {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;

    for (int i = 0; i < len; ++i) {
        // eighth element should be a member of [0, len)
        if (i == gpuData[7]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 9th element should be 1
        val &= (2 * i + 7);
    }

    if (val != gpuData[8]) {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        // 10th element should be 0xff
        val |= (1 << i);
    }

    if (val != gpuData[9]) {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 11th element should be 0xff
        val ^= i;
    }

    if (val != gpuData[10]) {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}
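A quick worked check of the atomicAnd expectation above: every operand 2*i + 7 is odd, so bit 0 always survives, while the higher bits of the initial 0xff are cleared within the first two iterations (0xff & 7 = 7, then 7 & 9 = 1):

int v = 0xff;
for (int i = 0; i < 8; ++i)
    v &= (2 * i + 7); // 7, 9, 11, ... -> v == 1 after i = 1 and stays 1
// v == 1, matching the "9th element should be 1" comment in computeGold()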


@ -35,48 +35,49 @@
//! @param g_idata input data in global memory
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int *g_odata)
{
    // access thread id
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Test various atomic instructions

    // Arithmetic atomic instructions

    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    // Atomic subtraction (final should be 0)
    atomicSub(&g_odata[1], 10);

    // Atomic exchange
    atomicExch(&g_odata[2], tid);

    // Atomic maximum
    atomicMax(&g_odata[3], tid);

    // Atomic minimum
    atomicMin(&g_odata[4], tid);

    // Atomic increment (modulo 17+1)
    atomicInc((unsigned int *)&g_odata[5], 17);

    // Atomic decrement
    atomicDec((unsigned int *)&g_odata[6], 137);

    // Atomic compare-and-swap
    atomicCAS(&g_odata[7], tid - 1, tid);

    // Bitwise atomic instructions

    // Atomic AND
    atomicAnd(&g_odata[8], 2 * tid + 7);

    // Atomic OR
    atomicOr(&g_odata[9], 1 << tid);

    // Atomic XOR
    atomicXor(&g_odata[10], tid);
}

#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_
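The kernel above only exercises the built-in integer atomics; operations without a hardware primitive are conventionally built from atomicCAS in a retry loop. A common sketch (not part of this sample) for a single-precision maximum:

__device__ float atomicMaxFloatSketch(float *addr, float value)
{
    int *addr_as_int = (int *)addr;
    int old = *addr_as_int, assumed;

    do {
        assumed = old;
        // Install max(value, current) only if the slot was not changed concurrently.
        old = atomicCAS(addr_as_int, assumed, __float_as_int(fmaxf(value, __int_as_float(assumed))));
    } while (assumed != old);

    return __int_as_float(old);
}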


@ -30,10 +30,10 @@
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32 #ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN #define WINDOWS_LEAN_AND_MEAN
@ -46,7 +46,7 @@
#include <nvrtc_helper.h> #include <nvrtc_helper.h>
// Utilities and timing functions // Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h #include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
const char *sampleName = "simpleAtomicIntrinsics_nvrtc"; const char *sampleName = "simpleAtomicIntrinsics_nvrtc";
@@ -64,84 +64,90 @@ extern "C" bool computeGold(int *gpuData, const int len);
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    int    dev = 0;
    char  *cubin, *kernel_file;
    size_t cubinSize;

    kernel_file = sdkFindFilePath("simpleAtomicIntrinsics_kernel.cuh", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

    CUmodule module = loadCUBIN(cubin, argc, argv);

    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel"));

    StopWatchInterface *timer;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    unsigned int numThreads = 256;
    unsigned int numBlocks  = 64;
    unsigned int numData    = 11;
    unsigned int memSize    = sizeof(int) * numData;

    // allocate memory for the result on the host side
    int *hOData = (int *)malloc(memSize);

    // initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    // allocate device memory for the result and copy the host data over
    CUdeviceptr dOData;
    checkCudaErrors(cuMemAlloc(&dOData, memSize));
    checkCudaErrors(cuMemcpyHtoD(dOData, hOData, memSize));

    // execute the kernel
    dim3 cudaBlockSize(numThreads, 1, 1);
    dim3 cudaGridSize(numBlocks, 1, 1);

    void *arr[] = {(void *)&dOData};
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   cudaGridSize.x,
                                   cudaGridSize.y,
                                   cudaGridSize.z, /* grid dim */
                                   cudaBlockSize.x,
                                   cudaBlockSize.y,
                                   cudaBlockSize.z, /* block dim */
                                   0,
                                   0,       /* shared mem, stream */
                                   &arr[0], /* arguments */
                                   0));

    checkCudaErrors(cuCtxSynchronize());

    // Copy result from device to host
    checkCudaErrors(cuMemcpyDtoH(hOData, dOData, memSize));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    free(hOData);
    checkCudaErrors(cuMemFree(dOData));
}
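The driver-API launch above passes kernel arguments through the void *arr[] array handed to cuLaunchKernel. For readers more used to the runtime API, a minimal sketch (not part of this commit) of the equivalent launch, assuming testKernel is compiled into the executable and d_odata is a device allocation of numData ints:

    int *d_odata = NULL;
    cudaMalloc(&d_odata, memSize);
    cudaMemcpy(d_odata, hOData, memSize, cudaMemcpyHostToDevice);
    testKernel<<<numBlocks, numThreads>>>(d_odata); // same grid/block split as cuLaunchKernel above
    cudaMemcpy(hOData, d_odata, memSize, cudaMemcpyDeviceToHost);
    cudaFree(d_odata);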
@@ -43,139 +43,140 @@ extern "C" int computeGold(int *gpuData, const int len);
//! @param len  number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
int computeGold(int *gpuData, const int len)
{
    int val = 0;

    for (int i = 0; i < len; ++i) {
        val += 10;
    }

    if (val != gpuData[0]) {
        printf("atomicAdd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        val -= 10;
    }

    if (val != gpuData[1]) {
        printf("atomicSub failed\n");
        return false;
    }

    bool found = false;

    for (int i = 0; i < len; ++i) {
        // third element should be a member of [0, len)
        if (i == gpuData[2]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for (int i = 0; i < len; ++i) {
        // fourth element should be len-1
        val = max(val, i);
    }

    if (val != gpuData[3]) {
        printf("atomicMax failed\n");
        return false;
    }

    val = 1 << 8;

    for (int i = 0; i < len; ++i) {
        val = min(val, i);
    }

    if (val != gpuData[4]) {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val       = 0;

    for (int i = 0; i < len; ++i) {
        val = (val >= limit) ? 0 : val + 1;
    }

    if (val != gpuData[5]) {
        printf("atomicInc failed\n");
        return false;
    }

    limit = 137;
    val   = 0;

    for (int i = 0; i < len; ++i) {
        val = ((val == 0) || (val > limit)) ? limit : val - 1;
    }

    if (val != gpuData[6]) {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;

    for (int i = 0; i < len; ++i) {
        // eighth element should be a member of [0, len)
        if (i == gpuData[7]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 9th element should be 1
        val &= (2 * i + 7);
    }

    if (val != gpuData[8]) {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        // 10th element should be 0xff
        val |= (1 << i);
    }

    if (val != gpuData[9]) {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 11th element should be 0xff
        val ^= i;
    }

    if (val != gpuData[10]) {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}
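computeGold mirrors each device atomic with a serial host loop. As a small arithmetic sketch (illustration only, assuming the sample's default launch of 64 blocks x 256 threads, i.e. len = 16384), the first few expected values can also be written in closed form:

    const int len         = 64 * 256;       // participating threads
    const int expectedAdd = 10 * len;        // gpuData[0]: 0 + 10 per thread -> 163840
    const int expectedSub = -10 * len;       // gpuData[1]: 0 - 10 per thread -> -163840
    const int expectedInc = len % (17 + 1);  // gpuData[5]: atomicInc wraps at 17 -> 4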
@@ -36,45 +36,46 @@
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void testKernel(int *g_odata)
{
    // access thread id
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Test various atomic instructions

    // Arithmetic atomic instructions

    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    // Atomic subtraction (final should be 0)
    atomicSub(&g_odata[1], 10);

    // Atomic exchange
    atomicExch(&g_odata[2], tid);

    // Atomic maximum
    atomicMax(&g_odata[3], tid);

    // Atomic minimum
    atomicMin(&g_odata[4], tid);

    // Atomic increment (modulo 17+1)
    atomicInc((unsigned int *)&g_odata[5], 17);

    // Atomic decrement
    atomicDec((unsigned int *)&g_odata[6], 137);

    // Atomic compare-and-swap
    atomicCAS(&g_odata[7], tid - 1, tid);

    // Bitwise atomic instructions

    // Atomic AND
    atomicAnd(&g_odata[8], 2 * tid + 7);

    // Atomic OR
    atomicOr(&g_odata[9], 1 << tid);

    // Atomic XOR
    atomicXor(&g_odata[10], tid);
}

#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_
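The kernel exercises atomicCAS only once per thread. In practice atomicCAS is most often used inside a compare-and-swap loop to build atomics the hardware does not provide natively; a sketch of that canonical pattern (not part of this sample), emulating a double-precision atomicAdd:

    __device__ double atomicAddDouble(double *address, double val)
    {
        unsigned long long int *address_as_ull = (unsigned long long int *)address;
        unsigned long long int  old = *address_as_ull, assumed;
        do {
            assumed = old;
            // swap in the new value only if nobody changed the location in the meantime
            old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old);
        return __longlong_as_double(old);
    }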
@@ -26,30 +26,31 @@
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes CUDA
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper functions for SDK examples

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);

cudaAccessPolicyWindow initAccessPolicyWindow(void)
{
    cudaAccessPolicyWindow accessPolicyWindow = {0};
    accessPolicyWindow.base_ptr               = (void *)0;
    accessPolicyWindow.num_bytes              = 0;
    accessPolicyWindow.hitRatio               = 0.f;
    accessPolicyWindow.hitProp                = cudaAccessPropertyNormal;
    accessPolicyWindow.missProp               = cudaAccessPropertyStreaming;
    return accessPolicyWindow;
}

////////////////////////////////////////////////////////////////////////////////
@@ -60,35 +61,35 @@ cudaAccessPolicyWindow initAccessPolicyWindow(void)
//! @param bigDataSize  input bigData size
//! @param hitCount     how many data accesses are done within the block
////////////////////////////////////////////////////////////////////////////////
static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, int bigDataSize, int hitCount)
{
    __shared__ unsigned int hit;
    int                     row    = blockIdx.y * blockDim.y + threadIdx.y;
    int                     col    = blockIdx.x * blockDim.x + threadIdx.x;
    int                     tID    = row * blockDim.y + col;
    uint32_t                psRand = tID;

    atomicExch(&hit, 0);
    __syncthreads();

    while (hit < hitCount) {
        psRand ^= psRand << 13;
        psRand ^= psRand >> 17;
        psRand ^= psRand << 5;

        int idx = tID - psRand;
        if (idx < 0) {
            idx = -idx;
        }

        if ((tID % 2) == 0) {
            data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize];
        }
        else {
            trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize];
        }

        atomicAdd(&hit, 1);
    }
}
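The index scrambling inside the loop is the classic 32-bit xorshift generator. A host-side sketch (illustration only) of the same update, handy for reasoning about which elements a given thread will touch:

    static uint32_t xorshift32(uint32_t x)
    {
        x ^= x << 13;
        x ^= x >> 17;
        x ^= x << 5;
        return x;
    }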
////////////////////////////////////////////////////////////////////////////////
// Program main
@@ -98,117 +99,110 @@ int main(int argc, char **argv) { runTest(argc, argv); }
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    bool                   bTestResult = true;
    cudaAccessPolicyWindow accessPolicyWindow;
    cudaDeviceProp         deviceProp;
    cudaStreamAttrValue    streamAttrValue;
    cudaStream_t           stream;
    cudaStreamAttrID       streamAttrID;
    dim3                   threads(32, 32);
    int                   *dataDevicePointer;
    int                   *dataHostPointer;
    int                    dataSize;
    int                   *bigDataDevicePointer;
    int                   *bigDataHostPointer;
    int                    bigDataSize;
    StopWatchInterface    *timer = 0;

    printf("%s Starting...\n\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // Get device properties
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
    dim3 blocks(deviceProp.maxGridSize[1], 1);

    // Make sure the device supports the persisting L2 cache optimization
    if (deviceProp.persistingL2CacheMaxSize == 0) {
        printf("Waiving execution as device %d does not support persisting L2 "
               "Caching\n",
               devID);
        exit(EXIT_WAIVED);
    }

    // Create stream to associate with window
    checkCudaErrors(cudaStreamCreate(&stream));

    // Set the amount of L2 cache that will be persisting to the maximum the device
    // can support
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize));

    // Stream attribute to set
    streamAttrID = cudaStreamAttributeAccessPolicyWindow;

    // Default window
    streamAttrValue.accessPolicyWindow = initAccessPolicyWindow();
    accessPolicyWindow                 = initAccessPolicyWindow();

    // Allocate size of both buffers
    bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int);
    dataSize    = (deviceProp.l2CacheSize / 4) / sizeof(int);

    // Allocate data
    checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int)));
    checkCudaErrors(cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));

    for (int i = 0; i < bigDataSize; ++i) {
        if (i < dataSize) {
            dataHostPointer[i] = i;
        }

        bigDataHostPointer[bigDataSize - i - 1] = i;
    }

    checkCudaErrors(cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
    checkCudaErrors(
        cudaMemcpyAsync(dataDevicePointer, dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(
        bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream));

    // Make a window for the buffer of interest
    accessPolicyWindow.base_ptr        = (void *)dataDevicePointer;
    accessPolicyWindow.num_bytes       = dataSize * sizeof(int);
    accessPolicyWindow.hitRatio        = 1.f;
    accessPolicyWindow.hitProp         = cudaAccessPropertyPersisting;
    accessPolicyWindow.missProp        = cudaAccessPropertyNormal;
    streamAttrValue.accessPolicyWindow = accessPolicyWindow;

    // Assign window to stream
    checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));

    // Demote any previously persisting lines
    checkCudaErrors(cudaCtxResetPersistingL2Cache());

    checkCudaErrors(cudaStreamSynchronize(stream));
    kernCacheSegmentTest<<<blocks, threads, 0, stream>>>(
        dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF);

    checkCudaErrors(cudaStreamSynchronize(stream));
    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    // Free memory
    checkCudaErrors(cudaFreeHost(dataHostPointer));
    checkCudaErrors(cudaFreeHost(bigDataHostPointer));
    checkCudaErrors(cudaFree(dataDevicePointer));
    checkCudaErrors(cudaFree(bigDataDevicePointer));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
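Condensed from the flow above, the minimal recipe for making accesses to one buffer persist in L2 is a single stream attribute. This is a sketch (not part of the commit); d_buf, numBytes and s are placeholders for a device allocation, its byte size, and an existing stream:

    cudaStreamAttrValue attr          = {};
    attr.accessPolicyWindow.base_ptr  = (void *)d_buf;
    attr.accessPolicyWindow.num_bytes = numBytes; // limited by the device's accessPolicyMaxWindowSize
    attr.accessPolicyWindow.hitRatio  = 1.0f;     // fraction of accesses that get the hitProp
    attr.accessPolicyWindow.hitProp   = cudaAccessPropertyPersisting;
    attr.accessPolicyWindow.missProp  = cudaAccessPropertyStreaming;
    checkCudaErrors(cudaStreamSetAttribute(s, cudaStreamAttributeAccessPolicyWindow, &attr));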
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

## References (for more details)
@@ -35,28 +35,30 @@ __device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); }
__device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); }

// convert floating point rgb color to 8-bit integer
__device__ int rgbToInt(float r, float g, float b)
{
    r = clamp(r, 0.0f, 255.0f);
    g = clamp(g, 0.0f, 255.0f);
    b = clamp(b, 0.0f, 255.0f);
    return (int(b) << 16) | (int(g) << 8) | int(r);
}

__global__ void cudaProcess(unsigned int *g_odata, int imgw)
{
    extern __shared__ uchar4 sdata[];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bw = blockDim.x;
    int bh = blockDim.y;
    int x  = blockIdx.x * bw + tx;
    int y  = blockIdx.y * bh + ty;

    uchar4 c4             = make_uchar4((x & 0x20) ? 100 : 0, 0, (y & 0x20) ? 100 : 0, 0);
    g_odata[y * imgw + x] = rgbToInt(c4.z, c4.y, c4.x);
}

extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw)
{
    cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw);
}
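rgbToInt packs the three clamped channels into a 0x00BBGGRR integer. A worked example (illustration only): rgbToInt(255.f, 128.f, 0.f) clamps to (255, 128, 0) and returns

    (0 << 16) | (128 << 8) | 255 // == 0x000080FF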
@@ -29,115 +29,124 @@
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)

// Create thread
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
{
    return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
}

// Wait for thread to finish
void cutEndThread(CUTThread thread)
{
    WaitForSingleObject(thread, INFINITE);
    CloseHandle(thread);
}

// Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num)
{
    WaitForMultipleObjects(num, threads, true, INFINITE);

    for (int i = 0; i < num; i++) {
        CloseHandle(threads[i]);
    }
}

// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount)
{
    CUTBarrier barrier;

    InitializeCriticalSection(&barrier.criticalSection);
    barrier.barrierEvent = CreateEvent(NULL, TRUE, FALSE, TEXT("BarrierEvent"));
    barrier.count        = 0;
    barrier.releaseCount = releaseCount;

    return barrier;
}

// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier)
{
    int myBarrierCount;
    EnterCriticalSection(&barrier->criticalSection);
    myBarrierCount = ++barrier->count;
    LeaveCriticalSection(&barrier->criticalSection);

    if (myBarrierCount >= barrier->releaseCount) {
        SetEvent(barrier->barrierEvent);
    }
}

// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier) { WaitForSingleObject(barrier->barrierEvent, INFINITE); }

// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier) {}

#else

// Create thread
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
{
    pthread_t thread;
    pthread_create(&thread, NULL, func, data);
    return thread;
}

// Wait for thread to finish
void cutEndThread(CUTThread thread) { pthread_join(thread, NULL); }

// Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num)
{
    for (int i = 0; i < num; i++) {
        cutEndThread(threads[i]);
    }
}

// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount)
{
    CUTBarrier barrier;

    barrier.count        = 0;
    barrier.releaseCount = releaseCount;

    pthread_mutex_init(&barrier.mutex, 0);
    pthread_cond_init(&barrier.conditionVariable, 0);

    return barrier;
}

// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier)
{
    int myBarrierCount;
    pthread_mutex_lock(&barrier->mutex);
    myBarrierCount = ++barrier->count;
    pthread_mutex_unlock(&barrier->mutex);

    if (myBarrierCount >= barrier->releaseCount) {
        pthread_cond_signal(&barrier->conditionVariable);
    }
}

// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier)
{
    pthread_mutex_lock(&barrier->mutex);

    while (barrier->count < barrier->releaseCount) {
        pthread_cond_wait(&barrier->conditionVariable, &barrier->mutex);
    }

    pthread_mutex_unlock(&barrier->mutex);
}

// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier)
{
    pthread_mutex_destroy(&barrier->mutex);
    pthread_cond_destroy(&barrier->conditionVariable);
}

#endif
@@ -37,15 +37,16 @@
typedef HANDLE CUTThread;
typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *);

struct CUTBarrier
{
    CRITICAL_SECTION criticalSection;
    HANDLE           barrierEvent;
    int              releaseCount;
    int              count;
};

#define CUT_THREADPROC unsigned WINAPI
#define CUT_THREADEND  return 0

#else
// POSIX threads.
@@ -55,44 +56,46 @@ typedef pthread_t CUTThread;
typedef void *(*CUT_THREADROUTINE)(void *);

#define CUT_THREADPROC void *
#define CUT_THREADEND  return 0

struct CUTBarrier
{
    pthread_mutex_t mutex;
    pthread_cond_t  conditionVariable;
    int             releaseCount;
    int             count;
};

#endif

#ifdef __cplusplus
extern "C"
{
#endif

// Create thread.
CUTThread cutStartThread(CUT_THREADROUTINE, void *data);

// Wait for thread to finish.
void cutEndThread(CUTThread thread);

// Wait for multiple threads.
void cutWaitForThreads(const CUTThread *threads, int num);

// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount);

// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier);

// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier);

// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier);

#ifdef __cplusplus
} // extern "C"
#endif

#endif // MULTITHREADING_H
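A typical use of this barrier API is one increment per worker thread and a single wait in the spawning thread. This is a sketch, not part of the header; worker and runWorkers are hypothetical names:

    CUT_THREADPROC worker(void *arg)
    {
        CUTBarrier *barrier = (CUTBarrier *)arg;
        // ... per-thread work ...
        cutIncrementBarrier(barrier); // signal completion
        CUT_THREADEND;
    }

    void runWorkers(int n)
    {
        CUTBarrier barrier = cutCreateBarrier(n);
        for (int i = 0; i < n; ++i)
            cutStartThread(worker, &barrier);
        cutWaitForBarrier(&barrier); // returns once n increments have arrived
        cutDestroyBarrier(&barrier);
    }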
@@ -43,172 +43,173 @@
#include <stdio.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#include "multithreading.h"

const int N_workloads             = 8;
const int N_elements_per_workload = 100000;

CUTBarrier thread_barrier;

void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status, void *data);

struct heterogeneous_workload
{
    int id;
    int cudaDeviceID;

    int         *h_data;
    int         *d_data;
    cudaStream_t stream;

    bool success;
};

__global__ void incKernel(int *data, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N)
        data[i]++;
}

CUT_THREADPROC launch(void *void_arg)
{
    heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;

    // Select GPU for this CPU thread
    checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));

    // Allocate Resources
    checkCudaErrors(cudaStreamCreate(&workload->stream));
    checkCudaErrors(cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
    checkCudaErrors(cudaHostAlloc(&workload->h_data, N_elements_per_workload * sizeof(int), cudaHostAllocPortable));

    // CPU thread generates data
    for (int i = 0; i < N_elements_per_workload; ++i) {
        workload->h_data[i] = workload->id + i;
    }

    // Schedule work for GPU in CUDA stream without blocking the CPU thread
    // Note: Dedicated streams enable concurrent execution of workloads on the GPU
    dim3 block(512);
    dim3 grid((N_elements_per_workload + block.x - 1) / block.x);

    checkCudaErrors(cudaMemcpyAsync(workload->d_data,
                                    workload->h_data,
                                    N_elements_per_workload * sizeof(int),
                                    cudaMemcpyHostToDevice,
                                    workload->stream));
    incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data, N_elements_per_workload);
    checkCudaErrors(cudaMemcpyAsync(workload->h_data,
                                    workload->d_data,
                                    N_elements_per_workload * sizeof(int),
                                    cudaMemcpyDeviceToHost,
                                    workload->stream));

    // New in CUDA 5.0: Add a CPU callback which is called once all currently
    // pending operations in the CUDA stream have finished
    checkCudaErrors(cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));

    CUT_THREADEND;
    // CPU thread end of life, GPU continues to process data...
}

CUT_THREADPROC postprocess(void *void_arg)
{
    heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;

    // ... GPU is done with processing, continue on new CPU thread...

    // Select GPU for this CPU thread
    checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));

    // CPU thread consumes results from GPU
    workload->success = true;

    for (int i = 0; i < N_workloads; ++i) {
        workload->success &= workload->h_data[i] == i + workload->id + 1;
    }

    // Free Resources
    checkCudaErrors(cudaFree(workload->d_data));
    checkCudaErrors(cudaFreeHost(workload->h_data));
    checkCudaErrors(cudaStreamDestroy(workload->stream));

    // Signal the end of the heterogeneous workload to main thread
    cutIncrementBarrier(&thread_barrier);

    CUT_THREADEND;
}

void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data)
{
    // Check status of GPU after stream operations are done
    checkCudaErrors(status);

    // Spawn new CPU worker thread and continue processing on the CPU
    cutStartThread(postprocess, data);
}

int main(int argc, char **argv)
{
    int N_gpus, max_gpus = 0;
    int gpuInfo[32]; // assume a maximum of 32 GPUs in a system configuration

    printf("Starting simpleCallback\n");

    checkCudaErrors(cudaGetDeviceCount(&N_gpus));
    printf("Found %d CUDA capable GPUs\n", N_gpus);

    if (N_gpus > 32) {
        printf("simpleCallback only supports 32 GPU(s)\n");
    }

    for (int devid = 0; devid < N_gpus; devid++) {
        int            SMversion;
        cudaDeviceProp deviceProp;
        cudaSetDevice(devid);
        cudaGetDeviceProperties(&deviceProp, devid);
        SMversion = deviceProp.major << 4 + deviceProp.minor;
        printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name, deviceProp.major, deviceProp.minor);
        printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable");

        if (SMversion >= 0x11) {
            gpuInfo[max_gpus++] = devid;
        }
    }

    printf("%d GPUs available to run Callback Functions\n", max_gpus);

    heterogeneous_workload *workloads;
    workloads      = (heterogeneous_workload *)malloc(N_workloads * sizeof(heterogeneous_workload));
    thread_barrier = cutCreateBarrier(N_workloads);

    // Main thread spawns a CPU worker thread for each heterogeneous workload
    printf("Starting %d heterogeneous computing workloads\n", N_workloads);

    for (int i = 0; i < N_workloads; ++i) {
        workloads[i].id           = i;
        workloads[i].cudaDeviceID = gpuInfo[i % max_gpus]; // i % N_gpus;

        cutStartThread(launch, &workloads[i]);
    }

    // Sleep until all workloads have finished
    cutWaitForBarrier(&thread_barrier);
    printf("Total of %d workloads finished:\n", N_workloads);

    bool success = true;

    for (int i = 0; i < N_workloads; ++i) {
        success &= workloads[i].success;
    }

    printf("%s\n", success ? "Success" : "Failure");

    free(workloads);

    exit(success ? EXIT_SUCCESS : EXIT_FAILURE);
}
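Newer CUDA releases also provide cudaLaunchHostFunc for this "run host code once the stream drains" hand-off. A sketch (not part of the sample): the host function receives only a void * and must not call CUDA APIs itself, so it hands the work to a CPU thread just as myStreamCallback does:

    void CUDART_CB myHostFn(void *data)
    {
        // continue on a CPU worker thread, as above
        cutStartThread(postprocess, data);
    }

    // inside launch(), in place of cudaStreamAddCallback:
    checkCudaErrors(cudaLaunchHostFunc(workload->stream, myHostFn, workload));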
@@ -38,8 +38,8 @@
 *
 */

#include <cooperative_groups.h>
#include <stdio.h>

using namespace cooperative_groups;

@@ -49,35 +49,36 @@ using namespace cooperative_groups;
 * calculates the sum of val across the group g. The workspace array, x,
 * must be large enough to contain g.size() integers.
 */
__device__ int sumReduction(thread_group g, int *x, int val)
{
    // rank of this thread in the group
    int lane = g.thread_rank();

    // for each iteration of this loop, the number of threads active in the
    // reduction, i, is halved, and each active thread (with index [lane])
    // performs a single summation of its own value with that
    // of a "partner" (with index [lane+i])
    for (int i = g.size() / 2; i > 0; i /= 2) {
        // store value for this thread in temporary array
        x[lane] = val;

        // synchronize all threads in group
        g.sync();

        if (lane < i)
            // active threads perform summation of their value with
            // their partner's value
            val += x[lane + i];

        // synchronize all threads in group
        g.sync();
    }

    // master thread in group returns result, and others return -1.
    if (g.thread_rank() == 0)
        return val;
    else
        return -1;
}

/**
@@ -85,93 +86,92 @@ __device__ int sumReduction(thread_group g, int *x, int val)
 *
 * Creates cooperative groups and performs reductions
 */
__global__ void cgkernel()
{
    // threadBlockGroup includes all threads in the block
    thread_block threadBlockGroup     = this_thread_block();
    int          threadBlockGroupSize = threadBlockGroup.size();

    // workspace array in shared memory required for reduction
    extern __shared__ int workspace[];

    int input, output, expectedOutput;

    // input to reduction, for each thread, is its rank in the group
    input = threadBlockGroup.thread_rank();

    // expected output from analytical formula (n-1)(n)/2
    // (noting that indexing starts at 0 rather than 1)
    expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2;

    // perform reduction
    output = sumReduction(threadBlockGroup, workspace, input);

    // master thread in group prints out result
    if (threadBlockGroup.thread_rank() == 0) {
        printf(" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
               (int)threadBlockGroup.size() - 1,
               output,
               expectedOutput);

        printf(" Now creating %d groups, each of size 16 threads:\n\n", (int)threadBlockGroup.size() / 16);
    }

    threadBlockGroup.sync();

    // each tiledPartition16 group includes 16 threads
    thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup);

    // This offset allows each group to have its own unique area in the workspace
    // array
    int workspaceOffset = threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();

    // input to reduction, for each thread, is its rank in the group
    input = tiledPartition16.thread_rank();

    // expected output from analytical formula (n-1)(n)/2
    // (noting that indexing starts at 0 rather than 1)
    expectedOutput = 15 * 16 / 2;

    // Perform reduction
    output = sumReduction(tiledPartition16, workspace + workspaceOffset, input);

    // each master thread prints out result
    if (tiledPartition16.thread_rank() == 0)
        printf(" Sum of all ranks 0..15 in this tiledPartition16 group is %d "
               "(expected %d)\n",
               output,
               expectedOutput);

    return;
}

/**
 * Host main routine
 */
int main()
{
    // Error code to check return values for CUDA calls
    cudaError_t err;

    // Launch the kernel
    int blocksPerGrid   = 1;
    int threadsPerBlock = 64;

    printf("\nLaunching a single block with %d threads...\n\n", threadsPerBlock);

    // we use the optional third argument to specify the size
    // of shared memory required in the kernel
    cgkernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>();
    err = cudaDeviceSynchronize();

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("\n...Done.\n\n");

    return 0;
}
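For a statically sized tile such as tiledPartition16, the shared-memory workspace can be avoided by using the tile's warp shuffles. A sketch (not part of the sample) of the same sum for a 16-thread tile:

    __device__ int tileSumReduction(cooperative_groups::thread_block_tile<16> tile, int val)
    {
        for (int i = tile.size() / 2; i > 0; i /= 2)
            val += tile.shfl_down(val, i);
        return val; // the full sum ends up in the rank-0 thread of the tile
    }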
@ -26,27 +26,27 @@
*/ */
/* /*
* This sample demonstrates how to use texture fetches from layered 2D textures * This sample demonstrates how to use texture fetches from layered 2D textures
* in CUDA C * in CUDA C
* *
* This sample first generates a 3D input data array for the layered texture * This sample first generates a 3D input data array for the layered texture
* and the expected output. Then it starts CUDA C kernels, one for each layer, * and the expected output. Then it starts CUDA C kernels, one for each layer,
* which fetch their layer's texture data (using normalized texture coordinates) * which fetch their layer's texture data (using normalized texture coordinates)
* transform it to the expected output, and write it to a 3D output data array. * transform it to the expected output, and write it to a 3D output data array.
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes CUDA // includes CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
// helper functions and utilities to work with CUDA // helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h>
static const char *sSDKname = "simpleCubemapTexture"; static const char *sSDKname = "simpleCubemapTexture";
@ -56,213 +56,207 @@ static const char *sSDKname = "simpleCubemapTexture";
//! Transform a cubemap face of a linear buffe using cubemap texture lookups //! Transform a cubemap face of a linear buffe using cubemap texture lookups
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *g_odata, int width, __global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex)
cudaTextureObject_t tex) { {
// calculate this thread's data point // calculate this thread's data point
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
// 0.5f offset and division are necessary to access the original data points // 0.5f offset and division are necessary to access the original data points
// in the texture (such that bilinear interpolation will not be activated). // in the texture (such that bilinear interpolation will not be activated).
// For details, see also CUDA Programming Guide, Appendix D // For details, see also CUDA Programming Guide, Appendix D
float u = ((x + 0.5f) / (float)width) * 2.f - 1.f; float u = ((x + 0.5f) / (float)width) * 2.f - 1.f;
float v = ((y + 0.5f) / (float)width) * 2.f - 1.f; float v = ((y + 0.5f) / (float)width) * 2.f - 1.f;
float cx, cy, cz; float cx, cy, cz;
for (unsigned int face = 0; face < 6; face++) { for (unsigned int face = 0; face < 6; face++) {
// Layer 0 is positive X face // Layer 0 is positive X face
if (face == 0) { if (face == 0) {
cx = 1; cx = 1;
cy = -v; cy = -v;
cz = -u; cz = -u;
} }
// Layer 1 is negative X face // Layer 1 is negative X face
else if (face == 1) { else if (face == 1) {
cx = -1; cx = -1;
cy = -v; cy = -v;
cz = u; cz = u;
} }
// Layer 2 is positive Y face // Layer 2 is positive Y face
else if (face == 2) { else if (face == 2) {
cx = u; cx = u;
cy = 1; cy = 1;
cz = v; cz = v;
} }
// Layer 3 is negative Y face // Layer 3 is negative Y face
else if (face == 3) { else if (face == 3) {
cx = u; cx = u;
cy = -1; cy = -1;
cz = -v; cz = -v;
} }
// Layer 4 is positive Z face // Layer 4 is positive Z face
else if (face == 4) { else if (face == 4) {
cx = u; cx = u;
cy = -v; cy = -v;
cz = 1; cz = 1;
} }
// Layer 4 is negative Z face // Layer 4 is negative Z face
else if (face == 5) { else if (face == 5) {
cx = -u; cx = -u;
cy = -v; cy = -v;
cz = -1; cz = -1;
} }
// read from texture, do expected transformation and write to global memory // read from texture, do expected transformation and write to global memory
g_odata[face * width * width + y * width + x] = g_odata[face * width * width + y * width + x] = -texCubemap<float>(tex, cx, cy, cz);
-texCubemap<float>(tex, cx, cy, cz); }
}
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
// use command-line specified CUDA device, otherwise use device with highest {
// Gflops/s // use command-line specified CUDA device, otherwise use device with highest
int devID = findCudaDevice(argc, (const char **)argv); // Gflops/s
int devID = findCudaDevice(argc, (const char **)argv);
bool bResult = true; bool bResult = true;
// get number of SMs on this GPU // get number of SMs on this GPU
cudaDeviceProp deviceProps; cudaDeviceProp deviceProps;
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
deviceProps.multiProcessorCount); printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);
printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);
if (deviceProps.major < 2) { if (deviceProps.major < 2) {
printf( printf("%s requires SM 2.0 or higher for support of Texture Arrays. Test "
"%s requires SM 2.0 or higher for support of Texture Arrays. Test " "will exit... \n",
"will exit... \n", sSDKname);
sSDKname);
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
}
// generate input data for layered texture
unsigned int width = 64, num_faces = 6, num_layers = 1;
unsigned int cubemap_size = width * width * num_faces;
unsigned int size = cubemap_size * num_layers * sizeof(float);
float *h_data = (float *)malloc(size);
for (int i = 0; i < (int)(cubemap_size * num_layers); i++) {
h_data[i] = (float)i;
}
// this is the expected transformation of the input data (the expected output)
float *h_data_ref = (float *)malloc(size);
for (unsigned int layer = 0; layer < num_layers; layer++) {
for (int i = 0; i < (int)(cubemap_size); i++) {
h_data_ref[layer * cubemap_size + i] =
-h_data[layer * cubemap_size + i] + layer;
} }
}
// allocate device memory for result // generate input data for layered texture
float *d_data = NULL; unsigned int width = 64, num_faces = 6, num_layers = 1;
checkCudaErrors(cudaMalloc((void **)&d_data, size)); unsigned int cubemap_size = width * width * num_faces;
unsigned int size = cubemap_size * num_layers * sizeof(float);
float *h_data = (float *)malloc(size);
// allocate array and copy image data for (int i = 0; i < (int)(cubemap_size * num_layers); i++) {
cudaChannelFormatDesc channelDesc = h_data[i] = (float)i;
cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); }
cudaArray *cu_3darray;
// checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc,
// make_cudaExtent(width, height, num_layers), cudaArrayLayered ));
    checkCudaErrors(
        cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, width, num_faces), cudaArrayCubemap));

    cudaMemcpy3DParms myparms = {0};
    myparms.srcPos   = make_cudaPos(0, 0, 0);
    myparms.dstPos   = make_cudaPos(0, 0, 0);
    myparms.srcPtr   = make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
    myparms.dstArray = cu_3darray;
    myparms.extent   = make_cudaExtent(width, width, num_faces);
    myparms.kind     = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&myparms));

    cudaTextureObject_t tex;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cu_3darray;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModeLinear;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.addressMode[2]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1);

    printf("Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
           "block has 8 x 8 threads\n",
           width,
           num_layers,
           dimGrid.x,
           dimGrid.y);

    transformKernel<<<dimGrid, dimBlock>>>(d_data, width,
                                           tex); // warmup (for better timing)

    // check if kernel execution generated an error
    getLastCudaError("warmup Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // execute the kernel
    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, tex);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
    printf("%.2f Mtexlookups/sec\n", (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
    sdkDeleteTimer(&timer);

    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f, false);
    }
    else {
        printf("Comparing kernel output to expected data\n");

#define MIN_EPSILON_ERROR 5e-3f
        bResult = compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
    }

    // cleanup memory
    free(h_data);
    free(h_data_ref);
    free(h_odata);

    checkCudaErrors(cudaDestroyTextureObject(tex));
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cu_3darray));

    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
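The transformKernel launched above is defined earlier in this file and is not shown in this hunk. As a rough, hypothetical sketch of how a cubemap texture object such as tex is typically sampled on the device (the kernel name and the face-to-direction mapping below are illustrative assumptions, not code from this commit):

    // Illustrative sketch only (not part of this commit). Maps each (x, y) pixel of one
    // cubemap face to a lookup direction and fetches from the cubemap texture object.
    __global__ void sampleCubemapFaceX(float *out, int width, cudaTextureObject_t cubeTex)
    {
        unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
        unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

        // Map the pixel to [-1, 1] across the face.
        float u = 2.0f * ((x + 0.5f) / (float)width) - 1.0f;
        float v = 2.0f * ((y + 0.5f) / (float)width) - 1.0f;

        // For the +X face the direction is (1, -v, -u); other faces permute the components.
        out[y * width + x] = texCubemap<float>(cubeTex, 1.0f, -v, -u);
    }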

View File

@@ -33,12 +33,12 @@
 */

// Includes
#include <cstring>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <stdio.h>
#include <string.h>

// includes, project
#include <helper_cuda.h>
@@ -62,165 +62,165 @@ float *d_B;
float *d_C;

// Functions
int  CleanupNoFailure(CUcontext &cuContext);
void RandomInit(float *, int);
bool findModulePath(const char *, string &, char **, ostringstream &);

static void check(CUresult result, char const *const func, const char *const file, int const line)
{
    if (result) {
        fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, static_cast<unsigned int>(result), func);
        exit(EXIT_FAILURE);
    }
}

#define checkCudaDrvErrors(val) check((val), #val, __FILE__, __LINE__)

// Host code
int main(int argc, char **argv)
{
    printf("simpleDrvRuntime..\n");
    int        N = 50000, devID = 0;
    size_t     size = N * sizeof(float);
    CUdevice   cuDevice;
    CUfunction vecAdd_kernel;
    CUmodule   cuModule = 0;
    CUcontext  cuContext;

    // Initialize
    checkCudaDrvErrors(cuInit(0));

    cuDevice = findCudaDevice(argc, (const char **)argv);
    // Create context
    checkCudaDrvErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    string        module_path;
    ostringstream fatbin;

    if (!findModulePath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaDrvErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // Get function handle from module
    checkCudaDrvErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

    // Allocate input vectors h_A and h_B in host memory
    checkCudaErrors(cudaMallocHost(&h_A, size));
    checkCudaErrors(cudaMallocHost(&h_B, size));
    checkCudaErrors(cudaMallocHost(&h_C, size));

    // Initialize input vectors
    RandomInit(h_A, N);
    RandomInit(h_B, N);

    // Allocate vectors in device memory
    checkCudaErrors(cudaMalloc((void **)(&d_A), size));
    checkCudaErrors(cudaMalloc((void **)(&d_B), size));
    checkCudaErrors(cudaMalloc((void **)(&d_C), size));

    cudaStream_t stream;
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    // Copy vectors from host memory to device memory
    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));

    int threadsPerBlock = 256;
    int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

    void *args[] = {&d_A, &d_B, &d_C, &N};

    // Launch the CUDA kernel
    checkCudaDrvErrors(
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, stream, args, NULL));

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));
    // Verify result
    int i;

    for (i = 0; i < N; ++i) {
        float sum = h_A[i] + h_B[i];

        if (fabs(h_C[i] - sum) > 1e-7f) {
            break;
        }
    }

    checkCudaDrvErrors(cuModuleUnload(cuModule));
    CleanupNoFailure(cuContext);
    printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");

    exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
}

int CleanupNoFailure(CUcontext &cuContext)
{
    // Free device memory
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));

    // Free host memory
    if (h_A) {
        checkCudaErrors(cudaFreeHost(h_A));
    }

    if (h_B) {
        checkCudaErrors(cudaFreeHost(h_B));
    }

    if (h_C) {
        checkCudaErrors(cudaFreeHost(h_C));
    }
    checkCudaDrvErrors(cuCtxDestroy(cuContext));

    return EXIT_SUCCESS;
}

// Allocates an array with random float entries.
void RandomInit(float *data, int n)
{
    for (int i = 0; i < n; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
}

bool inline findModulePath(const char *module_file, string &module_path, char **argv, ostringstream &ostrm)
{
    char *actual_path = sdkFindFilePath(module_file, argv[0]);

    if (actual_path) {
        module_path = actual_path;
    }
    else {
        printf("> findModulePath file not found: <%s> \n", module_file);
        return false;
    }

    if (module_path.empty()) {
        printf("> findModulePath could not find file: <%s> \n", module_file);
        return false;
    }
    else {
        printf("> findModulePath found file at <%s>\n", module_path.c_str());
        if (module_path.rfind("fatbin") != string::npos) {
            ifstream fileIn(module_path.c_str(), ios::binary);
            ostrm << fileIn.rdbuf();
        }
        return true;
    }
}

View File

@@ -34,9 +34,10 @@
 */

// Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N)
        C[i] = A[i] + B[i];
}

View File

@@ -44,188 +44,188 @@ const char *sSDKsample = "hyperQ";

// This subroutine does no real work but runs for at least the specified number
// of clock ticks.
__device__ void clock_block(clock_t *d_o, clock_t clock_count)
{
    unsigned int start_clock = (unsigned int)clock();

    clock_t clock_offset = 0;

    while (clock_offset < clock_count) {
        unsigned int end_clock = (unsigned int)clock();

        // The code below should work like
        // this (thanks to modular arithmetics):
        //
        // clock_offset = (clock_t) (end_clock > start_clock ?
        //                           end_clock - start_clock :
        //                           end_clock + (0xffffffffu - start_clock));
        //
        // Indeed, let m = 2^32 then
        // end - start = end + m - start (mod m).

        clock_offset = (clock_t)(end_clock - start_clock);
    }

    d_o[0] = clock_offset;
}

// We create two identical kernels calling clock_block(), we create two so that
// we can identify dependencies in the profile timeline ("kernel_B" is always
// dependent on "kernel_A" in the same stream).
__global__ void kernel_A(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); }
__global__ void kernel_B(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); }

// Single-warp reduction kernel (note: this is not optimized for simplicity)
__global__ void sum(clock_t *d_clocks, int N)
{
    // Handle to thread block group
    cg::thread_block   cta = cg::this_thread_block();
    __shared__ clock_t s_clocks[32];

    clock_t my_sum = 0;

    for (int i = threadIdx.x; i < N; i += blockDim.x) {
        my_sum += d_clocks[i];
    }

    s_clocks[threadIdx.x] = my_sum;
    cg::sync(cta);

    for (int i = warpSize / 2; i > 0; i /= 2) {
        if (threadIdx.x < i) {
            s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
        }

        cg::sync(cta);
    }

    if (threadIdx.x == 0) {
        d_clocks[0] = s_clocks[0];
    }
}

int main(int argc, char **argv)
{
    int   nstreams    = 32; // One stream for each pair of kernels
    float kernel_time = 10; // Time each kernel should run in ms
    float elapsed_time;
    int   cuda_device = 0;

    printf("starting %s...\n", sSDKsample);

    // Get number of streams (if overridden on the command line)
    if (checkCmdLineFlag(argc, (const char **)argv, "nstreams")) {
        nstreams = getCmdLineArgumentInt(argc, (const char **)argv, "nstreams");
    }

    // Use command-line specified CUDA device, otherwise use device with
    // highest Gflops/s
    cuda_device = findCudaDevice(argc, (const char **)argv);

    // Get device properties
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDevice(&cuda_device));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

    // HyperQ is available in devices of Compute Capability 3.5 and higher
    if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) {
        if (deviceProp.concurrentKernels == 0) {
            printf("> GPU does not support concurrent kernel execution (SM 3.5 or "
                   "higher required)\n");
            printf("  CUDA kernel runs will be serialized\n");
        }
        else {
            printf("> GPU does not support HyperQ\n");
            printf("  CUDA kernel runs will have limited concurrency\n");
        }
    }

    printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
           deviceProp.major,
           deviceProp.minor,
           deviceProp.multiProcessorCount);

    // Allocate host memory for the output (reduced to a single value)
    clock_t *a = 0;
    checkCudaErrors(cudaMallocHost((void **)&a, sizeof(clock_t)));

    // Allocate device memory for the output (one value for each kernel)
    clock_t *d_a = 0;
    checkCudaErrors(cudaMalloc((void **)&d_a, 2 * nstreams * sizeof(clock_t)));

    // Allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

    for (int i = 0; i < nstreams; i++) {
        checkCudaErrors(cudaStreamCreate(&(streams[i])));
    }

    // Create CUDA event handles
    cudaEvent_t start_event, stop_event;
    checkCudaErrors(cudaEventCreate(&start_event));
    checkCudaErrors(cudaEventCreate(&stop_event));

    // Target time per kernel is kernel_time ms, clockRate is in KHz
    // Target number of clocks = target time * clock frequency
#if defined(__arm__) || defined(__aarch64__)
    // the kernel takes more time than the channel reset time on arm archs, so to
    // prevent hangs reduce time_clocks.
    clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 100));
#else
    clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
#endif
    clock_t total_clocks = 0;

    // Start the clock
    checkCudaErrors(cudaEventRecord(start_event, 0));

    // Queue pairs of {kernel_A, kernel_B} in separate streams
    for (int i = 0; i < nstreams; ++i) {
        kernel_A<<<1, 1, 0, streams[i]>>>(&d_a[2 * i], time_clocks);
        total_clocks += time_clocks;
        kernel_B<<<1, 1, 0, streams[i]>>>(&d_a[2 * i + 1], time_clocks);
        total_clocks += time_clocks;
    }

    // Stop the clock in stream 0 (i.e. all previous kernels will be complete)
    checkCudaErrors(cudaEventRecord(stop_event, 0));

    // At this point the CPU has dispatched all work for the GPU and can
    // continue processing other tasks in parallel. In this sample we just want
    // to wait until all work is done so we use a blocking cudaMemcpy below.

    // Run the sum kernel and copy the result back to host
    sum<<<1, 32>>>(d_a, 2 * nstreams);
    checkCudaErrors(cudaMemcpy(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost));

    // stop_event will have been recorded but including the synchronize here to
    // prevent copy/paste errors!
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));

    printf("Expected time for serial execution of %d sets of kernels is between "
           "approx. %.3fs and %.3fs\n",
           nstreams,
           (nstreams + 1) * kernel_time / 1000.0f,
           2 * nstreams * kernel_time / 1000.0f);
    printf("Expected time for fully concurrent execution of %d sets of kernels is "
           "approx. %.3fs\n",
           nstreams,
           2 * kernel_time / 1000.0f);
    printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f);

    bool bTestResult = (a[0] >= total_clocks);

    // Release resources
    for (int i = 0; i < nstreams; i++) {
        cudaStreamDestroy(streams[i]);
    }

    free(streams);
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    cudaFreeHost(a);
    cudaFree(d_a);

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
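For reference, a worked evaluation of the expected-time formulas printed above, using the default values nstreams = 32 and kernel_time = 10 ms (these are the formulas evaluated on paper, not measurements from any particular GPU):

    // Serial lower bound : (nstreams + 1) * kernel_time = 33 * 10 ms = 0.330 s
    // Serial upper bound : 2 * nstreams * kernel_time   = 64 * 10 ms = 0.640 s
    // Fully concurrent   : 2 * kernel_time              =  2 * 10 ms = 0.020 s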

View File

@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed.

## References (for more details)

View File

@@ -32,6 +32,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <vector>

#include "helper_cuda.h"
#include "helper_multiprocess.h"

static const char shmName[] = "simpleIPCshm";
@@ -39,7 +40,7 @@ static const char shmName[] = "simpleIPCshm";
// For NVSWITCH connected peers like DGX-2, simultaneous peers are not limited
// in the same way.
#define MAX_DEVICES (32)
#define DATA_SIZE   (64ULL << 20ULL) // 64MB

#if defined(__linux__)
#define cpu_atomic_add32(a, x) __sync_add_and_fetch(a, x)
@@ -49,281 +50,280 @@ static const char shmName[] = "simpleIPCshm";
#error Unsupported system
#endif

typedef struct shmStruct_st
{
    size_t nprocesses;
    int    barrier;
    int    sense;
    int    devices[MAX_DEVICES];
    cudaIpcMemHandle_t   memHandle[MAX_DEVICES];
    cudaIpcEventHandle_t eventHandle[MAX_DEVICES];
} shmStruct;

__global__ void simpleKernel(char *ptr, int sz, char val)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (; idx < sz; idx += (gridDim.x * blockDim.x)) {
        ptr[idx] = val;
    }
}

static void barrierWait(volatile int *barrier, volatile int *sense, unsigned int n)
{
    int count;

    // Check-in
    count = cpu_atomic_add32(barrier, 1);
    if (count == n) // Last one in
        *sense = 1;

    while (!*sense)
        ;

    // Check-out
    count = cpu_atomic_add32(barrier, -1);
    if (count == 0) // Last one out
        *sense = 0;

    while (*sense)
        ;
}

static void childProcess(int id)
{
    volatile shmStruct *shm = NULL;
    cudaStream_t        stream;
    sharedMemoryInfo    info;
    size_t              procCount, i;
    int                 blocks  = 0;
    int                 threads = 128;
    cudaDeviceProp      prop;
    std::vector<void *>      ptrs;
    std::vector<cudaEvent_t> events;
    std::vector<char>        verification_buffer(DATA_SIZE);

    if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) {
        printf("Failed to create shared memory slab\n");
        exit(EXIT_FAILURE);
    }
    shm       = (volatile shmStruct *)info.addr;
    procCount = shm->nprocesses;

    printf("Process %d: Starting on device %d...\n", id, shm->devices[id]);

    checkCudaErrors(cudaSetDevice(shm->devices[id]));
    checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id]));
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks, simpleKernel, threads, 0));
    blocks *= prop.multiProcessorCount;

    // Open and track all the allocations and events created in the master
    // process for use later
    for (i = 0; i < procCount; i++) {
        void       *ptr = NULL;
        cudaEvent_t event;

        // Notice, we don't need to explicitly enable peer access for
        // allocations on other devices.
        checkCudaErrors(
            cudaIpcOpenMemHandle(&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle[i], cudaIpcMemLazyEnablePeerAccess));
        checkCudaErrors(cudaIpcOpenEventHandle(&event, *(cudaIpcEventHandle_t *)&shm->eventHandle[i]));

        ptrs.push_back(ptr);
        events.push_back(event);
    }

    // At each iteration of the loop, each sibling process will push work on
    // their respective devices accessing the next peer mapped buffer allocated
    // by the master process (these can come from other sibling processes as
    // well). To coordinate each process' access, we force the stream to wait for
    // the work already accessing this buffer asynchronously through IPC events,
    // allowing the CPU processes to continue to queue more work.
    for (i = 0; i < procCount; i++) {
        size_t bufferId = (i + id) % procCount;

        // Wait for the buffer to be accessed to be ready
        checkCudaErrors(cudaStreamWaitEvent(stream, events[bufferId], 0));
        // Push a simple kernel on it
        simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId], DATA_SIZE, id);
        checkCudaErrors(cudaGetLastError());
        // Signal that this buffer is ready for the next consumer
        checkCudaErrors(cudaEventRecord(events[bufferId], stream));
        // Wait for all my sibling processes to push this stage of their work
        // before proceeding to the next. This prevents siblings from racing
        // ahead and clobbering the recorded event or waiting on the wrong
        // recorded event.
        barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
        if (id == 0) {
            printf("Step %lld done\n", (unsigned long long)i);
        }
    }

    // Now wait for my buffer to be ready so I can copy it locally and verify it
    checkCudaErrors(cudaStreamWaitEvent(stream, events[id], 0));
    checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, cudaMemcpyDeviceToHost, stream));

    // And wait for all the queued up work to complete
    checkCudaErrors(cudaStreamSynchronize(stream));

    printf("Process %d: verifying...\n", id);

    // The contents should have the id of the sibling just after me
    char compareId = (char)((id + 1) % procCount);
    for (unsigned long long j = 0; j < DATA_SIZE; j++) {
        if (verification_buffer[j] != compareId) {
            printf("Process %d: Verification mismatch at %lld: %d != %d\n",
                   id,
                   j,
                   (int)verification_buffer[j],
                   (int)compareId);
        }
    }

    // Clean up!
    for (i = 0; i < procCount; i++) {
        checkCudaErrors(cudaIpcCloseMemHandle(ptrs[i]));
        checkCudaErrors(cudaEventDestroy(events[i]));
    }

    checkCudaErrors(cudaStreamDestroy(stream));

    printf("Process %d complete!\n", id);
}

static void parentProcess(char *app)
{
    sharedMemoryInfo    info;
    int                 devCount, i;
    volatile shmStruct *shm = NULL;
    std::vector<void *>      ptrs;
    std::vector<cudaEvent_t> events;
    std::vector<Process>     processes;

    checkCudaErrors(cudaGetDeviceCount(&devCount));

    if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) {
        printf("Failed to create shared memory slab\n");
        exit(EXIT_FAILURE);
    }
    shm = (volatile shmStruct *)info.addr;
    memset((void *)shm, 0, sizeof(*shm));

    // Pick all the devices that can access each other's memory for this test
    // Keep in mind that CUDA has minimal support for fork() without a
    // corresponding exec() in the child process, but in this case our
    // spawnProcess will always exec, so no need to worry.
    for (i = 0; i < devCount; i++) {
        bool           allPeers = true;
        cudaDeviceProp prop;
        checkCudaErrors(cudaGetDeviceProperties(&prop, i));

        // CUDA IPC is only supported on devices with unified addressing
        if (!prop.unifiedAddressing) {
            printf("Device %d does not support unified addressing, skipping...\n", i);
            continue;
        }
        // This sample requires two processes accessing each device, so we need
        // to ensure exclusive or prohibited mode is not set
        if (prop.computeMode != cudaComputeModeDefault) {
            printf("Device %d is in an unsupported compute mode for this sample\n", i);
            continue;
        }

        for (int j = 0; j < shm->nprocesses; j++) {
            int canAccessPeerIJ, canAccessPeerJI;
            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i));
            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j]));
            if (!canAccessPeerIJ || !canAccessPeerJI) {
                allPeers = false;
                break;
            }
        }

        if (allPeers) {
            // Enable peers here. This isn't necessary for IPC, but it will
            // setup the peers for the device. For systems that only allow 8
            // peers per GPU at a time, this acts to remove devices from CanAccessPeer
            for (int j = 0; j < shm->nprocesses; j++) {
                checkCudaErrors(cudaSetDevice(i));
                checkCudaErrors(cudaDeviceEnablePeerAccess(shm->devices[j], 0));
                checkCudaErrors(cudaSetDevice(shm->devices[j]));
                checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0));
            }

            shm->devices[shm->nprocesses++] = i;
            if (shm->nprocesses >= MAX_DEVICES)
                break;
        }
        else {
            printf("Device %d is not peer capable with some other selected peers, "
                   "skipping\n",
                   i);
        }
    }

    if (shm->nprocesses == 0) {
        printf("No CUDA devices support IPC\n");
        exit(EXIT_WAIVED);
    }

    // Now allocate memory and an event for each process and fill the shared
    // memory buffer with the IPC handles to communicate
    for (i = 0; i < shm->nprocesses; i++) {
        void       *ptr = NULL;
        cudaEvent_t event;

        checkCudaErrors(cudaSetDevice(shm->devices[i]));
        checkCudaErrors(cudaMalloc(&ptr, DATA_SIZE));
        checkCudaErrors(cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle[i], ptr));
        checkCudaErrors(cudaEventCreate(&event, cudaEventDisableTiming | cudaEventInterprocess));
        checkCudaErrors(cudaIpcGetEventHandle((cudaIpcEventHandle_t *)&shm->eventHandle[i], event));

        ptrs.push_back(ptr);
        events.push_back(event);
    }

    // Launch the child processes!
    for (i = 0; i < shm->nprocesses; i++) {
        char        devIdx[12]; // Increased size to ensure enough space for formatted integer
        char *const args[] = {app, devIdx, NULL};
        Process     process;

        snprintf(devIdx, sizeof(devIdx), "%d", i);

        if (spawnProcess(&process, app, args)) {
            printf("Failed to create process\n");
            exit(EXIT_FAILURE);
        }

        processes.push_back(process);
    }

    // And wait for them to finish
    for (i = 0; i < processes.size(); i++) {
        if (waitProcess(&processes[i]) != EXIT_SUCCESS) {
            printf("Process %d failed!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    // Clean up!
    for (i = 0; i < shm->nprocesses; i++) {
        checkCudaErrors(cudaSetDevice(shm->devices[i]));
        checkCudaErrors(cudaEventSynchronize(events[i]));
        checkCudaErrors(cudaEventDestroy(events[i]));
        checkCudaErrors(cudaFree(ptrs[i]));
    }

    sharedMemoryClose(&info);
}

int main(int argc, char **argv)
{
#if defined(__arm__) || defined(__aarch64__)
    printf("Not supported on ARM\n");
    return EXIT_WAIVED;
#else
    if (argc == 1) {
        parentProcess(argv[0]);
    }
    else {
        childProcess(atoi(argv[1]));
    }
    return EXIT_SUCCESS;
#endif
}

View File

@@ -26,27 +26,27 @@
 */

/*
 * This sample demonstrates how to use texture fetches from layered 2D textures
 * in CUDA C
 *
 * This sample first generates a 3D input data array for the layered texture
 * and the expected output. Then it starts CUDA C kernels, one for each layer,
 * which fetch their layer's texture data (using normalized texture coordinates)
 * transform it to the expected output, and write it to a 3D output data array.
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, kernels
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples

static const char *sSDKname = "simpleLayeredTexture";
@@ -54,163 +54,156 @@ static const char *sSDKname = "simpleLayeredTexture";
//! Transform a layer of a layered 2D texture using texture lookups
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *g_odata, int width, int height, int layer, cudaTextureObject_t tex)
{
    // calculate this thread's data point
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // 0.5f offset and division are necessary to access the original data points
    // in the texture (such that bilinear interpolation will not be activated).
    // For details, see also CUDA Programming Guide, Appendix D
    float u = (x + 0.5f) / (float)width;
    float v = (y + 0.5f) / (float)height;

    // read from texture, do expected transformation and write to global memory
    g_odata[layer * width * height + y * width + x] = -tex2DLayered<float>(tex, u, v, layer) + layer;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("[%s] - Starting...\n", sSDKname);

    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    bool bResult = true;

    // get number of SMs on this GPU
    cudaDeviceProp deviceProps;

    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
    printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

    // generate input data for layered texture
    unsigned int width = 512, height = 512, num_layers = 5;
    unsigned int size   = width * height * num_layers * sizeof(float);
    float       *h_data = (float *)malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++) {
            h_data[layer * width * height + i] = (float)i;
        }

    // this is the expected transformation of the input data (the expected output)
    float *h_data_ref = (float *)malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++) {
            h_data_ref[layer * width * height + i] = -h_data[layer * width * height + i] + layer;
        }

    // allocate device memory for result
    float *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, size));

    // allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray            *cu_3darray;
    checkCudaErrors(
        cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
    cudaMemcpy3DParms myparms = {0};
    myparms.srcPos   = make_cudaPos(0, 0, 0);
    myparms.dstPos   = make_cudaPos(0, 0, 0);
    myparms.srcPtr   = make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
    myparms.dstArray = cu_3darray;
    myparms.extent   = make_cudaExtent(width, height, num_layers);
    myparms.kind     = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&myparms));

    cudaTextureObject_t tex;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cu_3darray;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModeLinear;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has "
           "8 x 8 threads\n",
           width,
           height,
           dimGrid.x,
           dimGrid.y);

    transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, 0,
                                           tex); // warmup (for better timing)

    // check if kernel execution generated an error
    getLastCudaError("warmup Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // execute the kernel
    for (unsigned int layer = 0; layer < num_layers; layer++)
        transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, layer, tex);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
    printf("%.2f Mtexlookups/sec\n", (width * height * num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
    sdkDeleteTimer(&timer);

    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, false);
    }
    else {
        printf("Comparing kernel output to expected data\n");

#define MIN_EPSILON_ERROR 5e-3f
        bResult = compareData(h_odata, h_data_ref, width * height * num_layers, MIN_EPSILON_ERROR, 0.0f);
    }

    // cleanup memory
    free(h_data);
    free(h_data_ref);
    free(h_odata);

    checkCudaErrors(cudaDestroyTextureObject(tex));
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cu_3darray));

    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

View File

@@ -26,15 +26,15 @@
 */

/* Simple example demonstrating how to use MPI with CUDA
 *
 * Generate some random numbers on one node.
 * Dispatch them to all nodes.
 * Compute their square root on each node's GPU.
 * Compute the average of the results using MPI.
 *
 * simpleMPI.cpp: main program, compiled with mpicxx on linux/Mac platforms
 *                on Windows, please download the Microsoft HPC Pack SDK 2008
 */

// MPI include
#include <mpi.h>
@@ -42,87 +42,88 @@
// System includes
#include <iostream>

using std::cerr;
using std::cout;
using std::endl;

// User include
#include "simpleMPI.h"

// Error handling macros
#define MPI_CHECK(call)                               \
    if ((call) != MPI_SUCCESS) {                      \
        cerr << "MPI error calling \"" #call "\"\n";  \
        my_abort(-1);                                 \
    }

// Host code
// No CUDA here, only MPI
int main(int argc, char *argv[])
{
    // Dimensions of the dataset
    int blockSize = 256;
    int gridSize = 10000;
    int dataSizePerNode = gridSize * blockSize;

    // Initialize MPI state
    MPI_CHECK(MPI_Init(&argc, &argv));

    // Get our MPI node number and node count
    int commSize, commRank;
    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &commSize));
    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &commRank));

    // Generate some random numbers on the root node (node 0)
    int dataSizeTotal = dataSizePerNode * commSize;
    float *dataRoot = NULL;

    // Are we the root node?
    if (commRank == 0) {
        cout << "Running on " << commSize << " nodes" << endl;
        dataRoot = new float[dataSizeTotal];
        initData(dataRoot, dataSizeTotal);
    }

    // Allocate a buffer on each node
    float *dataNode = new float[dataSizePerNode];

    // Dispatch a portion of the input data to each node
    MPI_CHECK(
        MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD));

    if (commRank == 0) {
        // No need for root data any more
        delete[] dataRoot;
    }

    // On each node, run computation on GPU
    computeGPU(dataNode, blockSize, gridSize);

    // Reduction to the root node, computing the sum of output elements
    float sumNode = sum(dataNode, dataSizePerNode);
    float sumRoot;

    MPI_CHECK(MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));

    if (commRank == 0) {
        float average = sumRoot / dataSizeTotal;
        cout << "Average of square roots is: " << average << endl;
    }

    // Cleanup
    delete[] dataNode;
    MPI_CHECK(MPI_Finalize());

    if (commRank == 0) {
        cout << "PASSED\n";
    }

    return 0;
}

// Shut down MPI cleanly if something goes wrong
void my_abort(int err)
{
    cout << "Test FAILED\n";
    MPI_Abort(MPI_COMM_WORLD, err);
}
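
The final average printed by this sample has a known expected value: initData fills the input with uniform random numbers in [0, 1], and the mean of sqrt(x) over that range is 2/3. A standalone sketch of that sanity check, which the sample itself does not perform:

    // Monte-Carlo estimate of E[sqrt(x)] for x uniform on [0, 1]; the sample's
    // "Average of square roots" output should approach the same 2/3 value.
    #include <cmath>
    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        const int n = 1 << 20;
        double acc = 0.0;
        for (int i = 0; i < n; i++) {
            acc += std::sqrt((double)rand() / RAND_MAX);
        }
        printf("estimate: %f, analytic value: %f\n", acc / n, 2.0 / 3.0);
        return 0;
    }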

View File

@@ -26,14 +26,14 @@
 */

/* Simple example demonstrating how to use MPI with CUDA
 *
 * Generate some random numbers on one node.
 * Dispatch them to all nodes.
 * Compute their square root on each node's GPU.
 * Compute the average of the results using MPI.
 *
 * simpleMPI.cu: GPU part, compiled with nvcc
 */

#include <iostream>

using std::cerr;
@@ -42,61 +42,63 @@ using std::endl;
#include "simpleMPI.h"

// Error handling macro
#define CUDA_CHECK(call)                                                      \
    if ((call) != cudaSuccess) {                                              \
        cudaError_t err = cudaGetLastError();                                 \
        cerr << "CUDA error calling \"" #call "\", code is " << err << endl;  \
        my_abort(err);                                                        \
    }

// Device code
// Very simple GPU Kernel that computes square roots of input numbers
__global__ void simpleMPIKernel(float *input, float *output)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    output[tid] = sqrt(input[tid]);
}

// Initialize an array with random data (between 0 and 1)
void initData(float *data, int dataSize)
{
    for (int i = 0; i < dataSize; i++) {
        data[i] = (float)rand() / RAND_MAX;
    }
}

// CUDA computation on each node
// No MPI here, only CUDA
void computeGPU(float *hostData, int blockSize, int gridSize)
{
    int dataSize = blockSize * gridSize;

    // Allocate data on GPU memory
    float *deviceInputData = NULL;
    CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float)));

    float *deviceOutputData = NULL;
    CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));

    // Copy to GPU memory
    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));

    // Run kernel
    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);

    // Copy data back to CPU memory
    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost));

    // Free GPU memory
    CUDA_CHECK(cudaFree(deviceInputData));
    CUDA_CHECK(cudaFree(deviceOutputData));
}

float sum(float *data, int size)
{
    float accum = 0.f;

    for (int i = 0; i < size; i++) {
        accum += data[i];
    }

    return accum;
}
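
simpleMPIKernel launches exactly gridSize * blockSize threads for a buffer of dataSize = blockSize * gridSize elements, so it needs no index guard. A hypothetical guarded variant, sketched only to show what would change if the element count were not an exact multiple of the launch size (it is not part of this sample):

    // Hypothetical bounds-checked variant of the kernel above.
    __global__ void simpleMPIKernelGuarded(const float *input, float *output, int n)
    {
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n) { // guard against threads past the end of the buffer
            output[tid] = sqrtf(input[tid]);
        }
    }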

View File

@@ -26,19 +26,20 @@
 */

/* Simple example demonstrating how to use MPI with CUDA
 *
 * Generate some random numbers on one node.
 * Dispatch them to all nodes.
 * Compute their square root on each node's GPU.
 * Compute the average of the results using MPI.
 *
 * simpleMPI.h: common header file
 */

// Forward declarations
extern "C"
{
    void initData(float *data, int dataSize);
    void computeGPU(float *hostData, int blockSize, int gridSize);
    float sum(float *data, int size);
    void my_abort(int err);
}

View File

@@ -38,7 +38,7 @@
 *
 * Elapsed times are averaged over nreps repetitions (10 by default).
 *
 */

const char *sSDKname = "simpleMultiCopy";
@@ -50,25 +50,26 @@ const char *sSDKname = "simpleMultiCopy";
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples

// includes, kernels
// Declare the CUDA kernels here and main() code that is needed to launch
// Compute workload on the system
__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < N) {
        for (int i = 0; i < inner_reps; ++i) {
            g_out[idx] = g_in[idx] + 1;
        }
    }
}

#define STREAM_COUNT 4

// Uncomment to simulate data source/sink IO times
// #define SIMULATE_IO

int *h_data_source;
int *h_data_sink;

@@ -79,13 +80,13 @@ int *d_data_in[STREAM_COUNT];
int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];

cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];
cudaEvent_t start, stop;

int N = 1 << 22;
int nreps = 10; // number of times each experiment is repeated
int inner_reps = 5;

int memsize;

@@ -96,278 +97,268 @@ dim3 grid;
int thread_blocks;

float processWithStreams(int streams_used);
void init();
bool test();

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
    int cuda_device = 0;
    float scale_factor;
    cudaDeviceProp deviceProp;

    printf("[%s] - Starting...\n", sSDKname);

    if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
        cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=");

        if (cuda_device < 0) {
            printf("Invalid command line parameters\n");
            exit(EXIT_FAILURE);
        }
        else {
            printf("cuda_device = %d\n", cuda_device);
            cuda_device = gpuDeviceInit(cuda_device);

            if (cuda_device < 0) {
                printf("No CUDA Capable devices found, exiting...\n");
                exit(EXIT_SUCCESS);
            }
        }
    }
    else {
        // Otherwise pick the device with the highest Gflops/s
        cuda_device = gpuGetMaxGflopsDeviceId();
        checkCudaErrors(cudaSetDevice(cuda_device));
        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
        printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);
    }

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
    printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
           deviceProp.name,
           deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

    // Anything that is less than 32 Cores will have scaled down workload
    scale_factor =
        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
            1.0f);
    N = (int)((float)N / scale_factor);

    printf("> Device name: %s\n", deviceProp.name);
    printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
           deviceProp.major,
           deviceProp.minor,
           deviceProp.multiProcessorCount);
    printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
    printf("> array_size = %d\n\n", N);

    memsize = N * sizeof(int);

    thread_blocks = N / block.x;

    grid.x = thread_blocks % 65535;
    grid.y = (thread_blocks / 65535 + 1);

    // Allocate resources
    h_data_source = (int *)malloc(memsize);
    h_data_sink = (int *)malloc(memsize);

    for (int i = 0; i < STREAM_COUNT; ++i) {
        checkCudaErrors(cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
        checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
        checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));

        checkCudaErrors(cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
        checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));

        checkCudaErrors(cudaStreamCreate(&stream[i]));
        checkCudaErrors(cudaEventCreate(&cycleDone[i]));

        cudaEventRecord(cycleDone[i], stream[i]);
    }

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    init();

    // Kernel warmup
    incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);

    // Time copies and kernel
    cudaEventRecord(start, 0);
    checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice, 0));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float memcpy_h2d_time;
    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

    cudaEventRecord(start, 0);
    checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float memcpy_d2h_time;
    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);

    cudaEventRecord(start, 0);
    incKernel<<<grid, block, 0, 0>>>(d_data_out[0], d_data_in[0], N, inner_reps);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float kernel_time;
    cudaEventElapsedTime(&kernel_time, start, stop);

    printf("\n");
    printf("Relevant properties of this CUDA device\n");
    printf("(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
           "(device property \"deviceOverlap\")\n",
           deviceProp.deviceOverlap ? "X" : " ");
    // printf("(%s) Can execute several GPU kernels simultaneously (compute
    // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
    printf("(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
           " (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
           "4000/5000/6000/K5000)\n",
           (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");

    printf("\n");
    printf("Measured timings (throughput):\n");
    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time);
    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time);
    printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time, (inner_reps * memsize * 2e-6) / kernel_time);

    printf("\n");
    printf("Theoretical limits for speedup gained from overlapped data "
           "transfers:\n");
    printf("No overlap at all (transfer-kernel-transfer): %f ms \n", memcpy_h2d_time + memcpy_d2h_time + kernel_time);
    printf("Compute can overlap with one transfer: %f ms\n", max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
    printf("Compute can overlap with both data transfers: %f ms\n",
           max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));

    // Process pipelined work
    float serial_time = processWithStreams(1);
    float overlap_time = processWithStreams(STREAM_COUNT);

    printf("\nAverage measured timings over %d repetitions:\n", nreps);
    printf(" Avg. time when execution fully serialized\t: %f ms\n", serial_time / nreps);
    printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT, overlap_time / nreps);
    printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n", (serial_time - overlap_time) / nreps);

    printf("\nMeasured throughput:\n");
    printf(" Fully serialized execution\t\t: %f GB/s\n", (nreps * (memsize * 2e-6)) / serial_time);
    printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT, (nreps * (memsize * 2e-6)) / overlap_time);

    // Verify the results, we will use the results for final output
    bool bResults = test();

    // Free resources
    free(h_data_source);
    free(h_data_sink);

    for (int i = 0; i < STREAM_COUNT; ++i) {
        cudaFreeHost(h_data_in[i]);
        cudaFree(d_data_in[i]);

        cudaFreeHost(h_data_out[i]);
        cudaFree(d_data_out[i]);

        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Test result
    exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
}

float processWithStreams(int streams_used)
{
    int current_stream = 0;

    float time;

    // Do processing in a loop
    //
    // Note: All memory commands are processed in the order they are issued,
    // independent of the stream they are enqueued in. Hence the pattern by
    // which the copy and kernel commands are enqueued in the stream
    // has an influence on the achieved overlap.

    cudaEventRecord(start, 0);

    for (int i = 0; i < nreps; ++i) {
        int next_stream = (current_stream + 1) % streams_used;

#ifdef SIMULATE_IO
        // Store the result
        memcpy(h_data_sink, h_data_out[current_stream], memsize);

        // Read new input
        memcpy(h_data_in[next_stream], h_data_source, memsize);
#endif

        // Ensure that processing and copying of the last cycle has finished
        cudaEventSynchronize(cycleDone[next_stream]);

        // Process current frame
        incKernel<<<grid, block, 0, stream[current_stream]>>>(
            d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);

        // Upload next frame
        checkCudaErrors(cudaMemcpyAsync(
            d_data_in[next_stream], h_data_in[next_stream], memsize, cudaMemcpyHostToDevice, stream[next_stream]));

        // Download current frame
        checkCudaErrors(cudaMemcpyAsync(h_data_out[current_stream],
                                        d_data_out[current_stream],
                                        memsize,
                                        cudaMemcpyDeviceToHost,
                                        stream[current_stream]));

        checkCudaErrors(cudaEventRecord(cycleDone[current_stream], stream[current_stream]));

        current_stream = next_stream;
    }

    cudaEventRecord(stop, 0);

    cudaDeviceSynchronize();

    cudaEventElapsedTime(&time, start, stop);

    return time;
}

void init()
{
    for (int i = 0; i < N; ++i) {
        h_data_source[i] = 0;
    }

    for (int i = 0; i < STREAM_COUNT; ++i) {
        memcpy(h_data_in[i], h_data_source, memsize);
    }
}

bool test()
{
    bool passed = true;

    for (int j = 0; j < STREAM_COUNT; ++j) {
        for (int i = 0; i < N; ++i) {
            passed &= (h_data_out[j][i] == 1);
        }
    }

    return passed;
}
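
The three "theoretical limits" printed by main() come straight from the per-operation timings: fully serialized is the sum, overlapping compute with the copies in one direction is the max of (copies, kernel), and overlapping with both directions is the max of all three. A small worked example with made-up timings, not measurements from the sample:

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        // assumed timings in ms
        float h2d = 4.0f, kernel = 6.0f, d2h = 4.0f;
        printf("no overlap           : %.1f ms\n", h2d + kernel + d2h);                   // 14.0
        printf("overlap one transfer : %.1f ms\n", std::max(h2d + d2h, kernel));          // 8.0
        printf("overlap both         : %.1f ms\n", std::max(std::max(h2d, d2h), kernel)); // 6.0
        return 0;
    }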

View File

@@ -37,15 +37,15 @@
 */

// System includes
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
@@ -57,180 +57,176 @@
// Data configuration
////////////////////////////////////////////////////////////////////////////////
const int MAX_GPU_COUNT = 32;
const int DATA_N = 1048576 * 32;

////////////////////////////////////////////////////////////////////////////////
// Simple reduction kernel.
// Refer to the 'reduction' CUDA Sample describing
// reduction optimization strategies
////////////////////////////////////////////////////////////////////////////////
__global__ static void reduceKernel(float *d_Result, float *d_Input, int N)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int threadN = gridDim.x * blockDim.x;
    float sum = 0;

    for (int pos = tid; pos < N; pos += threadN)
        sum += d_Input[pos];

    d_Result[tid] = sum;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // Solver config
    TGPUplan plan[MAX_GPU_COUNT];

    // GPU reduction results
    float h_SumGPU[MAX_GPU_COUNT];

    float sumGPU;
    double sumCPU, diff;

    int i, j, gpuBase, GPU_N;

    const int BLOCK_N = 32;
    const int THREAD_N = 256;
    const int ACCUM_N = BLOCK_N * THREAD_N;

    printf("Starting simpleMultiGPU\n");
    checkCudaErrors(cudaGetDeviceCount(&GPU_N));

    if (GPU_N > MAX_GPU_COUNT) {
        GPU_N = MAX_GPU_COUNT;
    }

    printf("CUDA-capable device count: %i\n", GPU_N);
    printf("Generating input data...\n\n");

    // Subdividing input data across GPUs
    // Get data sizes for each GPU
    for (i = 0; i < GPU_N; i++) {
        plan[i].dataN = DATA_N / GPU_N;
    }

    // Take into account "odd" data sizes
    for (i = 0; i < DATA_N % GPU_N; i++) {
        plan[i].dataN++;
    }

    // Assign data ranges to GPUs
    gpuBase = 0;

    for (i = 0; i < GPU_N; i++) {
        plan[i].h_Sum = h_SumGPU + i;
        gpuBase += plan[i].dataN;
    }

    // Create streams for issuing GPU command asynchronously and allocate memory
    // (GPU and System page-locked)
    for (i = 0; i < GPU_N; i++) {
        checkCudaErrors(cudaSetDevice(i));
        checkCudaErrors(cudaStreamCreate(&plan[i].stream));
        // Allocate memory
        checkCudaErrors(cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float)));
        checkCudaErrors(cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float)));
        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device, ACCUM_N * sizeof(float)));
        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data, plan[i].dataN * sizeof(float)));

        for (j = 0; j < plan[i].dataN; j++) {
            plan[i].h_Data[j] = (float)rand() / (float)RAND_MAX;
        }
    }

    // Start timing and compute on GPU(s)
    printf("Computing with %d GPUs...\n", GPU_N);
    // create and start timer
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);

    // start the timer
    sdkStartTimer(&timer);

    // Copy data to GPU, launch the kernel and copy data back. All asynchronously
    for (i = 0; i < GPU_N; i++) {
        // Set device
        checkCudaErrors(cudaSetDevice(i));

        // Copy input data from CPU
        checkCudaErrors(cudaMemcpyAsync(
            plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream));

        // Perform GPU computations
        reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
        getLastCudaError("reduceKernel() execution failed.\n");

        // Read back GPU results
        checkCudaErrors(cudaMemcpyAsync(
            plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream));
    }

    // Process GPU results
    for (i = 0; i < GPU_N; i++) {
        float sum;

        // Set device
        checkCudaErrors(cudaSetDevice(i));

        // Wait for all operations to finish
        cudaStreamSynchronize(plan[i].stream);

        // Finalize GPU reduction for current subvector
        sum = 0;

        for (j = 0; j < ACCUM_N; j++) {
            sum += plan[i].h_Sum_from_device[j];
        }

        *(plan[i].h_Sum) = (float)sum;

        // Shut down this GPU
        checkCudaErrors(cudaFreeHost(plan[i].h_Sum_from_device));
        checkCudaErrors(cudaFree(plan[i].d_Sum));
        checkCudaErrors(cudaFree(plan[i].d_Data));
        checkCudaErrors(cudaStreamDestroy(plan[i].stream));
    }

    sumGPU = 0;

    for (i = 0; i < GPU_N; i++) {
        sumGPU += h_SumGPU[i];
    }

    sdkStopTimer(&timer);
    printf(" GPU Processing time: %f (ms)\n\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Compute on Host CPU
    printf("Computing with Host CPU...\n\n");

    sumCPU = 0;

    for (i = 0; i < GPU_N; i++) {
        for (j = 0; j < plan[i].dataN; j++) {
            sumCPU += plan[i].h_Data[j];
        }
    }

    // Compare GPU and CPU results
    printf("Comparing GPU and Host CPU results...\n");
    diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
    printf(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU);
    printf(" Relative difference: %E \n\n", diff);

    // Cleanup and shutdown
    for (i = 0; i < GPU_N; i++) {
        checkCudaErrors(cudaSetDevice(i));
        checkCudaErrors(cudaFreeHost(plan[i].h_Data));
    }

    exit((diff < 1e-5) ? EXIT_SUCCESS : EXIT_FAILURE);
}
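
When DATA_N is not evenly divisible by the number of GPUs, the two loops near the top of main() give the first DATA_N % GPU_N plans one extra element. A small standalone illustration with made-up sizes (10 elements over 3 GPUs yields per-GPU counts of 4, 3, 3):

    #include <cstdio>

    int main()
    {
        const int DATA_N = 10, GPU_N = 3; // assumed illustration sizes
        int dataN[GPU_N];
        for (int i = 0; i < GPU_N; i++)
            dataN[i] = DATA_N / GPU_N; // even share
        for (int i = 0; i < DATA_N % GPU_N; i++)
            dataN[i]++; // distribute the remainder
        for (int i = 0; i < GPU_N; i++)
            printf("GPU %d gets %d elements\n", i, dataN[i]);
        return 0;
    }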

View File

@@ -37,26 +37,26 @@
#ifndef SIMPLEMULTIGPU_H
#define SIMPLEMULTIGPU_H

typedef struct
{
    // Host-side input data
    int dataN;
    float *h_Data;

    // Partial sum for this GPU
    float *h_Sum;

    // Device buffers
    float *d_Data, *d_Sum;

    // Reduction copied back from GPU
    float *h_Sum_from_device;

    // Stream for asynchronous command execution
    cudaStream_t stream;

} TGPUplan;

extern "C" void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N, cudaStream_t &s);

#endif

View File

@@ -25,8 +25,8 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <helper_cuda.h> // helper functions for CUDA error check
#include <iostream>

const int manualBlockSize = 32;

@@ -38,13 +38,14 @@ const int manualBlockSize = 32;
// execution configuration, including anything the launch configurator
// API suggests.
////////////////////////////////////////////////////////////////////////////////
__global__ void square(int *array, int arrayCount)
{
    extern __shared__ int dynamicSmem[];
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx < arrayCount) {
        array[idx] *= array[idx];
    }
}

////////////////////////////////////////////////////////////////////////////////
@@ -58,29 +59,28 @@ __global__ void square(int *array, int arrayCount) {
// This wrapper routine computes the occupancy of kernel, and reports
// it in terms of active warps / maximum warps per SM.
////////////////////////////////////////////////////////////////////////////////
static double reportPotentialOccupancy(void *kernel, int blockSize, size_t dynamicSMem)
{
    int device;
    cudaDeviceProp prop;

    int numBlocks;
    int activeWarps;
    int maxWarps;

    double occupancy;

    checkCudaErrors(cudaGetDevice(&device));
    checkCudaErrors(cudaGetDeviceProperties(&prop, device));

    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, blockSize, dynamicSMem));

    activeWarps = numBlocks * blockSize / prop.warpSize;
    maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;

    occupancy = (double)activeWarps / maxWarps;

    return occupancy;
}

////////////////////////////////////////////////////////////////////////////////
@@ -99,65 +99,63 @@ static double reportPotentialOccupancy(void *kernel, int blockSize,
// This function configures the launch based on the "automatic"
// argument, records the runtime, and reports occupancy and runtime.
////////////////////////////////////////////////////////////////////////////////
static int launchConfig(int *array, int arrayCount, bool automatic)
{
    int blockSize;
    int minGridSize;
    int gridSize;
    size_t dynamicSMemUsage = 0;

    cudaEvent_t start;
    cudaEvent_t end;

    float elapsedTime;

    double potentialOccupancy;

    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&end));

    if (automatic) {
        checkCudaErrors(
            cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)square, dynamicSMemUsage, arrayCount));

        std::cout << "Suggested block size: " << blockSize << std::endl
                  << "Minimum grid size for maximum occupancy: " << minGridSize << std::endl;
    }
    else {
        // This block size is too small. Given limited number of
        // active blocks per multiprocessor, the number of active
        // threads will be limited, and thus unable to achieve maximum
        // occupancy.
        //
        blockSize = manualBlockSize;
    }

    // Round up
    //
    gridSize = (arrayCount + blockSize - 1) / blockSize;

    // Launch and profile
    //
    checkCudaErrors(cudaEventRecord(start));
    square<<<gridSize, blockSize, dynamicSMemUsage>>>(array, arrayCount);
    checkCudaErrors(cudaEventRecord(end));

    checkCudaErrors(cudaDeviceSynchronize());

    // Calculate occupancy
    //
    potentialOccupancy = reportPotentialOccupancy((void *)square, blockSize, dynamicSMemUsage);

    std::cout << "Potential occupancy: " << potentialOccupancy * 100 << "%" << std::endl;

    // Report elapsed time
    //
    checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, end));
    std::cout << "Elapsed time: " << elapsedTime << "ms" << std::endl;

    return 0;
}

////////////////////////////////////////////////////////////////////////////////
@@ -166,41 +164,41 @@ static int launchConfig(int *array, int arrayCount, bool automatic) {
// The test generates an array and squares it with a CUDA kernel, then
// verifies the result.
////////////////////////////////////////////////////////////////////////////////
static int test(bool automaticLaunchConfig, const int count = 1000000)
{
    int *array;
    int *dArray;
    int size = count * sizeof(int);

    array = new int[count];

    for (int i = 0; i < count; i += 1) {
        array[i] = i;
    }

    checkCudaErrors(cudaMalloc(&dArray, size));
    checkCudaErrors(cudaMemcpy(dArray, array, size, cudaMemcpyHostToDevice));

    for (int i = 0; i < count; i += 1) {
        array[i] = 0;
    }

    launchConfig(dArray, count, automaticLaunchConfig);

    checkCudaErrors(cudaMemcpy(array, dArray, size, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaFree(dArray));

    // Verify the return data
    //
    for (int i = 0; i < count; i += 1) {
        if (array[i] != i * i) {
            std::cout << "element " << i << " expected " << i * i << " actual " << array[i] << std::endl;
            return 1;
        }
    }

    delete[] array;

    return 0;
}

////////////////////////////////////////////////////////////////////////////////
@@ -210,31 +208,31 @@ static int test(bool automaticLaunchConfig, const int count = 1000000) {
// automatically configured launch, and reports the occupancy and
// performance.
////////////////////////////////////////////////////////////////////////////////
int main()
{
    int status;

    std::cout << "starting Simple Occupancy" << std::endl << std::endl;

    std::cout << "[ Manual configuration with " << manualBlockSize << " threads per block ]" << std::endl;

    status = test(false);

    if (status) {
        std::cerr << "Test failed\n" << std::endl;
        return -1;
    }

    std::cout << std::endl;

    std::cout << "[ Automatic, occupancy-based configuration ]" << std::endl;
    status = test(true);

    if (status) {
        std::cerr << "Test failed\n" << std::endl;
        return -1;
    }

    std::cout << std::endl;

    std::cout << "Test PASSED\n" << std::endl;

    return 0;
}
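
The occupancy figure reported by reportPotentialOccupancy is simply active warps divided by the maximum warps an SM can hold. A worked example with assumed numbers: a 32-thread block, an assumed cudaOccupancyMaxActiveBlocksPerMultiprocessor result of 16 blocks, and typical limits of warpSize = 32 and 2048 threads per SM:

    #include <cstdio>

    int main()
    {
        int numBlocks = 16, blockSize = 32;        // assumed occupancy-query result and block size
        int warpSize = 32, maxThreadsPerSM = 2048; // typical values; real ones are device-specific
        int activeWarps = numBlocks * blockSize / warpSize; // 16
        int maxWarps = maxThreadsPerSM / warpSize;          // 64
        printf("occupancy = %.0f%%\n", 100.0 * activeWarps / maxWarps); // 25%
        return 0;
    }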

View File

@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed.

## References (for more details)

View File

@@ -31,230 +31,233 @@
 */

// includes, system
#include <stdio.h>
#include <stdlib.h>

// CUDA includes
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples

__global__ void SimpleKernel(float *src, float *dst)
{
    // Just a dummy kernel, doing enough for us to verify that everything
    // worked
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    dst[idx] = src[idx] * 2.0f;
}

inline bool IsAppBuiltAs64() { return sizeof(void *) == 8; }

int main(int argc, char **argv)
{
    printf("[%s] - Starting...\n", argv[0]);

    if (!IsAppBuiltAs64()) {
        printf("%s is only supported with on 64-bit OSs and the application must be "
               "built as a 64-bit target. Test is being waived.\n",
               argv[0]);
        exit(EXIT_WAIVED);
    }

    // Number of GPUs
    printf("Checking for multiple GPUs...\n");
    int gpu_n;
    checkCudaErrors(cudaGetDeviceCount(&gpu_n));
    printf("CUDA-capable device count: %i\n", gpu_n);

    if (gpu_n < 2) {
        printf("Two or more GPUs with Peer-to-Peer access capability are required for "
               "%s.\n",
               argv[0]);
        printf("Waiving test.\n");
        exit(EXIT_WAIVED);
    }

    // Query device properties
    cudaDeviceProp prop[64];
    int gpuid[2]; // we want to find the first two GPU's that can support P2P

    for (int i = 0; i < gpu_n; i++) {
        checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
    }

    // Check possibility for peer access
    printf("\nChecking GPU(s) for support of peer to peer memory access...\n");

    int can_access_peer;
    int p2pCapableGPUs[2]; // We take only 1 pair of P2P capable GPUs
    p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1;

    // Show all the combinations of supported P2P GPUs
    for (int i = 0; i < gpu_n; i++) {
        for (int j = 0; j < gpu_n; j++) {
            if (i == j) {
                continue;
            }
            checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, i, j));
            printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
                   prop[i].name,
                   i,
                   prop[j].name,
                   j,
                   can_access_peer ? "Yes" : "No");
            if (can_access_peer && p2pCapableGPUs[0] == -1) {
                p2pCapableGPUs[0] = i;
                p2pCapableGPUs[1] = j;
            }
        }
    }

    if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) {
        printf("Two or more GPUs with Peer-to-Peer access capability are required for "
               "%s.\n",
               argv[0]);
        printf("Peer to Peer access is not available amongst GPUs in the system, "
               "waiving test.\n");

        exit(EXIT_WAIVED);
    }

    // Use first pair of p2p capable GPUs detected.
    gpuid[0] = p2pCapableGPUs[0];
    gpuid[1] = p2pCapableGPUs[1];

    // Enable peer access
    printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0], gpuid[1]);
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[0], 0));

    // Allocate buffers
    const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
    printf(
        "Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]);
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    float *g0;
    checkCudaErrors(cudaMalloc(&g0, buf_size));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    float *g1;
    checkCudaErrors(cudaMalloc(&g1, buf_size));
    float *h0;
    checkCudaErrors(cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA

    // Create CUDA event handles
    printf("Creating event handles...\n");
    cudaEvent_t start_event, stop_event;
    float time_memcpy;
    int eventflags = cudaEventBlockingSync;
    checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
    checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));

    // P2P memcopy() benchmark
    checkCudaErrors(cudaEventRecord(start_event, 0));

    for (int i = 0; i < 100; i++) {
        // With UVA we don't need to specify source and target devices, the
        // runtime figures this out by itself from the pointers
        // Ping-pong copy between GPUs
        if (i % 2 == 0) {
            checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
        }
        else {
            checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
        }
    }

    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
    printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n",
           gpuid[0],
           gpuid[1],
           (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);

    // Prepare host buffer and copy to GPU 0
    printf("Preparing host buffer and memcpy to GPU%d...\n", gpuid[0]);

    for (int i = 0; i < buf_size / sizeof(float); i++) {
        h0[i] = float(i % 4096);
    }

    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault));

    // Kernel launch configuration
    const dim3 threads(512, 1);
    const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);

    // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
    // output to the GPU 1 buffer
    printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
           "GPU%d...\n",
           gpuid[1],
           gpuid[0],
           gpuid[1]);
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    SimpleKernel<<<blocks, threads>>>(g0, g1);
    checkCudaErrors(cudaDeviceSynchronize());

    // Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
    // output to the GPU 0 buffer
    printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
           "GPU%d...\n",
           gpuid[0],
           gpuid[1],
           gpuid[0]);
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    SimpleKernel<<<blocks, threads>>>(g1, g0);
    checkCudaErrors(cudaDeviceSynchronize());

    // Copy data back to host and verify
    printf("Copy data back to host from GPU%d and verify results...\n", gpuid[0]);
    checkCudaErrors(cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault));

    int error_count = 0;

    for (int i = 0; i < buf_size / sizeof(float); i++) {
        // Re-generate input data and apply 2x '* 2.0f' computation of both
        // kernel runs
        if (h0[i] != float(i % 4096) * 2.0f * 2.0f) {
            printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i], (float(i % 4096) * 2.0f * 2.0f));

            if (error_count++ > 10) {
                break;
            }
        }
    }

    // Disable peer access (also unregisters memory for non-UVA cases)
    printf("Disabling peer access...\n");
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[1]));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[0]));

    // Cleanup and shutdown
    printf("Shutting down...\n");
    checkCudaErrors(cudaEventDestroy(start_event));
    checkCudaErrors(cudaEventDestroy(stop_event));
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaFree(g0));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    checkCudaErrors(cudaFree(g1));
    checkCudaErrors(cudaFreeHost(h0));

    for (int i = 0; i < gpu_n; i++) {
        checkCudaErrors(cudaSetDevice(i));
    }

    if (error_count != 0) {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }
    else {
        printf("Test passed\n");
        exit(EXIT_SUCCESS);
    }
}
if (error_count++ > 10) {
break;
}
}
}
// Disable peer access (also unregisters memory for non-UVA cases)
printf("Disabling peer access...\n");
checkCudaErrors(cudaSetDevice(gpuid[0]));
checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[1]));
checkCudaErrors(cudaSetDevice(gpuid[1]));
checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[0]));
// Cleanup and shutdown
printf("Shutting down...\n");
checkCudaErrors(cudaEventDestroy(start_event));
checkCudaErrors(cudaEventDestroy(stop_event));
checkCudaErrors(cudaSetDevice(gpuid[0]));
checkCudaErrors(cudaFree(g0));
checkCudaErrors(cudaSetDevice(gpuid[1]));
checkCudaErrors(cudaFree(g1));
checkCudaErrors(cudaFreeHost(h0));
for (int i = 0; i < gpu_n; i++) {
checkCudaErrors(cudaSetDevice(i));
}
if (error_count != 0) {
printf("Test failed!\n");
exit(EXIT_FAILURE);
}
else {
printf("Test passed\n");
exit(EXIT_SUCCESS);
}
}
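
Aside: the block below is an illustrative, self-contained sketch of the UVA peer-to-peer pattern the sample above times. It is not code from this commit; it assumes devices 0 and 1 are P2P capable, the buffer size is arbitrary, and error checking is omitted for brevity.

// Hypothetical standalone sketch of UVA peer-to-peer copies between two GPUs.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    int accessible01 = 0, accessible10 = 0;
    cudaDeviceCanAccessPeer(&accessible01, 0, 1);
    cudaDeviceCanAccessPeer(&accessible10, 1, 0);
    if (!accessible01 || !accessible10) {
        printf("GPUs 0 and 1 cannot access each other, nothing to demonstrate.\n");
        return 0;
    }

    // Each device grants the other access to its memory.
    cudaSetDevice(0);
    cudaDeviceEnablePeerAccess(1, 0);
    cudaSetDevice(1);
    cudaDeviceEnablePeerAccess(0, 0);

    // Allocate one buffer per device.
    const size_t bytes = 16 * 1024 * 1024;
    float *g0 = nullptr, *g1 = nullptr;
    cudaSetDevice(0);
    cudaMalloc(&g0, bytes);
    cudaSetDevice(1);
    cudaMalloc(&g1, bytes);

    // With UVA the runtime infers source and destination devices from the pointers.
    cudaMemcpy(g1, g0, bytes, cudaMemcpyDefault);

    // Tear down: disable peer access and free per-device buffers.
    cudaSetDevice(0);
    cudaDeviceDisablePeerAccess(1);
    cudaFree(g0);
    cudaSetDevice(1);
    cudaDeviceDisablePeerAccess(0);
    cudaFree(g1);
    return 0;
}

Once peer access is enabled, a plain cudaMemcpy with cudaMemcpyDefault is enough; the runtime resolves which device owns each pointer.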

View File

@ -26,16 +26,16 @@
 */

/* pitchLinearTexture
 *
 * This example demonstrates how to use textures bound to pitch linear memory.
 * It performs a shift of matrix elements using wrap addressing mode (aka
 * periodic boundary conditions) on two arrays, a pitch linear and a CUDA array,
 * in order to highlight the differences in using each.
 *
 * Textures binding to pitch linear memory is a new feature in CUDA 2.2,
 * and allows use of texture features such as wrap addressing mode and
 * filtering which are not possible with textures bound to regular linear memory
 */

// includes, system
#include <stdio.h>

@ -50,13 +50,13 @@
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

#define NUM_REPS 100 // number of repetitions performed
#define TILE_DIM 16  // tile/block size

const char *sSDKsample = "simplePitchLinearTexture";
@ -70,29 +70,26 @@ bool bTestResult = true;
////////////////////////////////////////////////////////////////////////////////
//! Shifts matrix elements using pitch linear array
//! @param odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
shiftPitchLinear(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefPL)
{
    int xid = blockIdx.x * blockDim.x + threadIdx.x;
    int yid = blockIdx.y * blockDim.y + threadIdx.y;

    odata[yid * pitch + xid] = tex2D<float>(texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
}

////////////////////////////////////////////////////////////////////////////////
//! Shifts matrix elements using regular array
//! @param odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
shiftArray(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefArray)
{
    int xid = blockIdx.x * blockDim.x + threadIdx.x;
    int yid = blockIdx.y * blockDim.y + threadIdx.y;

    odata[yid * pitch + xid] = tex2D<float>(texRefArray, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
}

////////////////////////////////////////////////////////////////////////////////
@ -102,210 +99,199 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n\n", sSDKsample);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sSDKsample, bTestResult ? "OK" : "ERROR!");
    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    // Set array size
    const int nx = 2048;
    const int ny = 2048;

    // Setup shifts applied to x and y data
    const int x_shift = 5;
    const int y_shift = 7;

    if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0)) {
        printf("nx and ny must be multiples of TILE_DIM\n");
        exit(EXIT_FAILURE);
    }

    // Setup execution configuration parameters
    dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM), dimBlock(TILE_DIM, TILE_DIM);

    // This will pick the best possible CUDA capable device
    int devID = findCudaDevice(argc, (const char **)argv);

    // CUDA events for timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Host allocation and initialization
    float *h_idata = (float *)malloc(sizeof(float) * nx * ny);
    float *h_odata = (float *)malloc(sizeof(float) * nx * ny);
    float *gold    = (float *)malloc(sizeof(float) * nx * ny);

    for (int i = 0; i < nx * ny; ++i) {
        h_idata[i] = (float)i;
    }

    // Device memory allocation
    // Pitch linear input data
    float *d_idataPL;
    size_t d_pitchBytes;

    checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes, nx * sizeof(float), ny));

    // Array input data
    cudaArray            *d_idataArray;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

    checkCudaErrors(cudaMallocArray(&d_idataArray, &channelDesc, nx, ny));

    // Pitch linear output data
    float *d_odata;
    checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_pitchBytes, nx * sizeof(float), ny));

    // Copy host data to device
    // Pitch linear
    size_t h_pitchBytes = nx * sizeof(float);

    checkCudaErrors(
        cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes, nx * sizeof(float), ny, cudaMemcpyHostToDevice));

    // Array
    checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata, nx * ny * sizeof(float), cudaMemcpyHostToDevice));

    cudaTextureObject_t texRefPL;
    cudaTextureObject_t texRefArray;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

    texRes.resType                  = cudaResourceTypePitch2D;
    texRes.res.pitch2D.devPtr       = d_idataPL;
    texRes.res.pitch2D.desc         = channelDesc;
    texRes.res.pitch2D.width        = nx;
    texRes.res.pitch2D.height       = ny;
    texRes.res.pitch2D.pitchInBytes = h_pitchBytes;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModePoint;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&texRefPL, &texRes, &texDescr, NULL));

    memset(&texRes, 0, sizeof(cudaResourceDesc));
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = d_idataArray;

    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModePoint;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL));

    // Reference calculation
    for (int j = 0; j < ny; ++j) {
        int jshift = (j + y_shift) % ny;

        for (int i = 0; i < nx; ++i) {
            int ishift       = (i + x_shift) % nx;
            gold[j * nx + i] = h_idata[jshift * nx + ishift];
        }
    }

    // Run ShiftPitchLinear kernel
    checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
checkCudaErrors(cudaEventRecord(start, 0));
for (int i = 0; i < NUM_REPS; ++i) {
shiftPitchLinear<<<dimGrid, dimBlock>>>(
d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefPL);
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
float timePL;
checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop));
// Check results
checkCudaErrors(
cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
bool res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);
bTestResult = true;
if (res == false) {
printf("*** shiftPitchLinear failed ***\n");
bTestResult = false;
}
// Run ShiftArray kernel
checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
checkCudaErrors(cudaEventRecord(start, 0));
for (int i = 0; i < NUM_REPS; ++i) {
shiftArray<<<dimGrid, dimBlock>>>(
d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefArray);
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
float timeArray;
checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop));
// Check results
checkCudaErrors(
cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);
if (res == false) {
printf("*** shiftArray failed ***\n");
bTestResult = false;
}
float bandwidthPL = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS);
float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timeArray / NUM_REPS);
printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n", bandwidthPL, bandwidthArray);
float fetchRatePL = nx * ny / 1.e+6f / (timePL / (1000.0f * NUM_REPS));
float fetchRateArray = nx * ny / 1.e+6f / (timeArray / (1000.0f * NUM_REPS));
printf("\nTexture fetch rate (Mpix/s) for pitch linear: "
"%.2e; for array: %.2e\n\n",
fetchRatePL,
fetchRateArray);
// Cleanup
free(h_idata);
free(h_odata);
free(gold);
checkCudaErrors(cudaDestroyTextureObject(texRefPL));
checkCudaErrors(cudaDestroyTextureObject(texRefArray));
checkCudaErrors(cudaFree(d_idataPL));
checkCudaErrors(cudaFreeArray(d_idataArray));
checkCudaErrors(cudaFree(d_odata));
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
}
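
Aside: a minimal sketch (illustrative only, not from this commit) of binding a texture object to pitch linear memory with wrap addressing, the pattern the sample above benchmarks. The helper name and sizes are hypothetical and error checking is omitted.

// Hypothetical helper: allocate pitch linear memory and wrap it in a texture object.
#include <cstring>
#include <cuda_runtime.h>

cudaTextureObject_t makePitchLinearTexture(float **devPtr, size_t *pitchBytes, int width, int height)
{
    // Pitch linear allocation: each row is padded to a hardware-friendly pitch.
    cudaMallocPitch((void **)devPtr, pitchBytes, width * sizeof(float), height);

    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType                  = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr       = *devPtr;
    resDesc.res.pitch2D.desc         = cudaCreateChannelDesc<float>();
    resDesc.res.pitch2D.width        = width;
    resDesc.res.pitch2D.height       = height;
    resDesc.res.pitch2D.pitchInBytes = *pitchBytes;

    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.normalizedCoords = true;                // coordinates in [0, 1)
    texDesc.filterMode       = cudaFilterModePoint; // no interpolation
    texDesc.addressMode[0]   = cudaAddressModeWrap; // periodic boundary in x
    texDesc.addressMode[1]   = cudaAddressModeWrap; // periodic boundary in y
    texDesc.readMode         = cudaReadModeElementType;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
    return tex;
}

Wrap addressing is what makes the shift-with-periodic-boundaries kernel a single tex2D fetch; with plain linear memory the index arithmetic would have to handle the wraparound explicitly.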

View File

@ -26,48 +26,49 @@
 */

// System includes
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
#endif

__global__ void testKernel(int val)
{
    printf("[%d, %d]:\t\tValue is:%d\n",
           blockIdx.y * gridDim.x + blockIdx.x,
           threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x,
           val);
}

int main(int argc, char **argv)
{
    int            devID;
    cudaDeviceProp props;

    // This will pick the best possible CUDA capable device
    devID = findCudaDevice(argc, (const char **)argv);

    // Get GPU information
    checkCudaErrors(cudaGetDevice(&devID));
    checkCudaErrors(cudaGetDeviceProperties(&props, devID));
    printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name, props.major, props.minor);

    printf("printf() is called. Output:\n\n");

    // Kernel configuration, where a two-dimensional grid and
    // three-dimensional blocks are configured.
    dim3 dimGrid(2, 2);
    dim3 dimBlock(2, 2, 2);
    testKernel<<<dimGrid, dimBlock>>>(10);
    cudaDeviceSynchronize();

    return EXIT_SUCCESS;
}

View File

@ -44,141 +44,137 @@
 *
 * Elapsed times are averaged over nreps repetitions (10 by default).
 *
 */

const char *sSDKsample = "simpleStreams";

const char *sEventSyncMethod[] = {"cudaEventDefault", "cudaEventBlockingSync", "cudaEventDisableTiming", NULL};

const char *sDeviceSyncMethod[] = {"cudaDeviceScheduleAuto",
                                   "cudaDeviceScheduleSpin",
                                   "cudaDeviceScheduleYield",
                                   "INVALID",
                                   "cudaDeviceScheduleBlockingSync",
                                   NULL};

// System includes
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef WIN32
#include <sys/mman.h> // for mmap() / munmap()
#endif

// Macro to aligned up to the memory size in question
#define MEMORY_ALIGNMENT 4096
#define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))
__global__ void init_array(int *g_data, int *factor, int num_iterations)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    for (int i = 0; i < num_iterations; i++) {
        g_data[idx] += *factor; // non-coalesced on purpose, to burn time
    }
}

bool correct_data(int *a, const int n, const int c)
{
for (int i = 0; i < n; i++) {
if (a[i] != c) {
printf("%d: %d %d\n", i, a[i], c);
return false;
}
}
return true;
}
inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
{
#if CUDART_VERSION >= 4000
#if !defined(__arm__) && !defined(__aarch64__)
    if (bPinGenericMemory) {
        // allocate a generic page-aligned chunk of system memory
#ifdef WIN32
        printf("> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned "
               "system memory)\n",
               (float)nbytes / 1048576.0f);
        *pp_a = (int *)VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
#else
        printf("> mmap() allocating %4.2f Mbytes (generic page-aligned system "
               "memory)\n",
               (float)nbytes / 1048576.0f);
        *pp_a = (int *)mmap(NULL, (nbytes + MEMORY_ALIGNMENT), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
#endif

        *ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);

        printf("> cudaHostRegister() registering %4.2f Mbytes of generic allocated "
               "system memory\n",
               (float)nbytes / 1048576.0f);
        // pin allocate memory
        checkCudaErrors(cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped));
    }
    else
#endif
#endif
    {
        printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n", (float)nbytes / 1048576.0f);
        // allocate host memory (pinned is required for achieve asynchronicity)
        checkCudaErrors(cudaMallocHost((void **)pp_a, nbytes));
        *ppAligned_a = *pp_a;
    }
}
inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
{
#if CUDART_VERSION >= 4000
#if !defined(__arm__) && !defined(__aarch64__)
    // CUDA 4.0 support pinning of generic host memory
    if (bPinGenericMemory) {
        // unpin and delete host memory
        checkCudaErrors(cudaHostUnregister(*ppAligned_a));
#ifdef WIN32
        VirtualFree(*pp_a, 0, MEM_RELEASE);
#else
        munmap(*pp_a, nbytes);
#endif
    }
    else
#endif
#endif
    {
        cudaFreeHost(*pp_a);
    }
}
static const char *sSyncMethod[] = {"0 (Automatic Blocking)",
                                    "1 (Spin Blocking)",
                                    "2 (Yield Blocking)",
                                    "3 (Undefined Blocking Method)",
                                    "4 (Blocking Sync Event) = low CPU utilization",
                                    NULL};

void printHelp()
{
    printf("Usage: %s [options below]\n", sSDKsample);
    printf("\t--sync_method=n for CPU/GPU synchronization\n");
    printf("\t             n=%s\n", sSyncMethod[0]);
    printf("\t             n=%s\n", sSyncMethod[1]);
    printf("\t             n=%s\n", sSyncMethod[2]);
    printf("\t   <Default> n=%s\n", sSyncMethod[4]);
    printf("\t--use_generic_memory (default) use generic page-aligned for system "
           "memory\n");
    printf("\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate "
           "system memory\n");
}
#if defined(__APPLE__) || defined(MACOSX)
@ -187,259 +183,240 @@ void printHelp() {
#define DEFAULT_PINNED_GENERIC_MEMORY true
#endif

int main(int argc, char **argv)
{
    int   cuda_device = 0;
    int   nstreams = 4;             // number of streams for CUDA calls
    int   nreps = 10;               // number of times each experiment is repeated
    int   n = 16 * 1024 * 1024;     // number of ints in the data set
    int   nbytes = n * sizeof(int); // number of data bytes
    dim3  threads, blocks;          // kernel launch configuration
    float elapsed_time, time_memcpy, time_kernel; // timing variables
    float scale_factor = 1.0f;

    // allocate generic memory and pin it laster instead of using cudaHostAlloc()
    bool bPinGenericMemory  = DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior
    int  device_sync_method = cudaDeviceBlockingSync;        // by default we use BlockingSync

    int niterations; // number of iterations for the loop inside the kernel

    printf("[ %s ]\n\n", sSDKsample);

    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
        printHelp();
        return EXIT_SUCCESS;
    }

    if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv, "sync_method")) >= 0) {
        if (device_sync_method == 0 || device_sync_method == 1 || device_sync_method == 2 || device_sync_method == 4) {
            printf("Device synchronization method set to = %s\n", sSyncMethod[device_sync_method]);
            printf("Setting reps to 100 to demonstrate steady state\n");
            nreps = 100;
        }
        else {
            printf("Invalid command line option sync_method=\"%d\"\n", device_sync_method);
            return EXIT_FAILURE;
        }
    }
    else {
        printHelp();
        return EXIT_SUCCESS;
    }
    if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) {
#if defined(__APPLE__) || defined(MACOSX)
        bPinGenericMemory = false; // Generic Pinning of System Paged memory not
                                   // currently supported on Mac OSX
#else
        bPinGenericMemory = true;
#endif
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "use_cuda_malloc_host")) {
        bPinGenericMemory = false;
    }

    printf("\n> ");
    cuda_device = findCudaDevice(argc, (const char **)argv);

    // check the compute capability of the device
    int num_devices = 0;
    checkCudaErrors(cudaGetDeviceCount(&num_devices));

    if (0 == num_devices) {
        printf("your system does not have a CUDA capable device, waiving test...\n");
        return EXIT_WAIVED;
    }

    // check if the command-line chosen device ID is within range, exit if not
    if (cuda_device >= num_devices) {
        printf("cuda_device=%d is invalid, must choose device ID between 0 and %d\n", cuda_device, num_devices - 1);
        return EXIT_FAILURE;
    }

    checkCudaErrors(cudaSetDevice(cuda_device));

    // Checking for compute capabilities
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

    niterations = 5;

    // Check if GPU can map host memory (Generic Method), if not then we override
    // bPinGenericMemory to be false
    if (bPinGenericMemory) {
        printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name, deviceProp.canMapHostMemory ? "Yes" : "No");

        if (deviceProp.canMapHostMemory == 0) {
            printf("Using cudaMallocHost, CUDA device does not support mapping of "
                   "generic host memory\n");
            bPinGenericMemory = false;
        }
    }
    // Anything that is less than 32 Cores will have scaled down workload
    scale_factor =
        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
            1.0f);
    n = (int)rint((float)n / scale_factor);

    printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor);
    printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n",
           deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

    printf("> scale_factor = %1.4f\n", 1.0f / scale_factor);
    printf("> array_size = %d\n\n", n);

    // enable use of blocking sync, to reduce CPU usage
    printf("> Using CPU/GPU Device Synchronization method (%s)\n", sDeviceSyncMethod[device_sync_method]);
    checkCudaErrors(cudaSetDeviceFlags(device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0)));

    // allocate host memory
    int  c = 5;          // value to which the array will be initialized
    int *h_a = 0;        // pointer to the array data in host memory
    int *hAligned_a = 0; // pointer to the array data in host memory (aligned to
                         // MEMORY_ALIGNMENT)

    // Allocate Host memory (could be using cudaMallocHost or VirtualAlloc/mmap if
    // using the new CUDA 4.0 features
    AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);

    // allocate device memory
    int *d_a = 0,
        *d_c = 0; // pointers to data and init value in the device memory
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    checkCudaErrors(cudaMemset(d_a, 0x0, nbytes));
    checkCudaErrors(cudaMalloc((void **)&d_c, sizeof(int)));
    checkCudaErrors(cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice));

    printf("\nStarting Test\n");

    // allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

    for (int i = 0; i < nstreams; i++) {
        checkCudaErrors(cudaStreamCreate(&(streams[i])));
    }

    // create CUDA event handles
    // use blocking sync
    cudaEvent_t start_event, stop_event;
    int         eventflags = ((device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync : cudaEventDefault);
checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));
// time memcopy from device
checkCudaErrors(cudaEventRecord(start_event, 0)); // record in stream-0, to
// ensure that all previous
// CUDA calls have
// completed
checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost, streams[0]));
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event)); // block until the event is actually recorded
checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
printf("memcopy:\t%.2f\n", time_memcpy);
// time kernel
threads = dim3(512, 1);
blocks = dim3(n / threads.x, 1);
checkCudaErrors(cudaEventRecord(start_event, 0));
init_array<<<blocks, threads, 0, streams[0]>>>(d_a, d_c, niterations);
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&time_kernel, start_event, stop_event));
printf("kernel:\t\t%.2f\n", time_kernel);
//////////////////////////////////////////////////////////////////////
// time non-streamed execution for reference
threads = dim3(512, 1);
blocks = dim3(n / threads.x, 1);
checkCudaErrors(cudaEventRecord(start_event, 0));
for (int k = 0; k < nreps; k++) {
init_array<<<blocks, threads>>>(d_a, d_c, niterations);
checkCudaErrors(cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost));
}
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
printf("non-streamed:\t%.2f\n", elapsed_time / nreps);
//////////////////////////////////////////////////////////////////////
// time execution with nstreams streams
threads = dim3(512, 1);
blocks = dim3(n / (nstreams * threads.x), 1);
    memset(hAligned_a, 255, nbytes); // set host memory bits to all 1s, for testing correctness
checkCudaErrors(cudaMemset(d_a, 0, nbytes)); // set device memory to all 0s, for testing correctness
checkCudaErrors(cudaEventRecord(start_event, 0));
for (int k = 0; k < nreps; k++) {
// asynchronously launch nstreams kernels, each operating on its own portion
// of data
for (int i = 0; i < nstreams; i++) {
init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_c, niterations);
}
// asynchronously launch nstreams memcopies. Note that memcopy in stream x
// will only
// commence executing when all previous CUDA calls in stream x have
// completed
for (int i = 0; i < nstreams; i++) {
checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams,
d_a + i * n / nstreams,
nbytes / nstreams,
cudaMemcpyDeviceToHost,
streams[i]));
}
}
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps);
// check whether the output is correct
printf("-------------------------------\n");
bool bResults = correct_data(hAligned_a, n, c * nreps * niterations);
// release resources
    for (int i = 0; i < nstreams; i++) {
        checkCudaErrors(cudaStreamDestroy(streams[i]));
    }

    checkCudaErrors(cudaEventDestroy(start_event));
    checkCudaErrors(cudaEventDestroy(stop_event));

    // Free cudaMallocHost or Generic Host allocated memory (from CUDA 4.0)
    FreeHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);

    checkCudaErrors(cudaFree(d_a));
    checkCudaErrors(cudaFree(d_c));

    return bResults ? EXIT_SUCCESS : EXIT_FAILURE;
}
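
Aside: an illustrative sketch (not from this commit) of the streamed kernel/copy overlap the sample above measures. The kernel, helper name, and slice sizes are hypothetical; the host buffer is assumed to be pinned (e.g. allocated with cudaMallocHost), n is assumed divisible by nstreams and the slice by the block size, and error checking is omitted.

// Hypothetical sketch: each stream runs a kernel on its slice and copies it back asynchronously.
#include <cuda_runtime.h>

__global__ void scaleSlice(int *data, int factor)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    data[idx] *= factor;
}

void runSlices(int *d_data, int *h_pinned, int n, int nstreams)
{
    cudaStream_t *streams = new cudaStream_t[nstreams];
    for (int i = 0; i < nstreams; i++) {
        cudaStreamCreate(&streams[i]);
    }

    const int slice   = n / nstreams; // elements per stream
    const int threads = 512;

    for (int i = 0; i < nstreams; i++) {
        int *d_slice = d_data + i * slice;
        scaleSlice<<<slice / threads, threads, 0, streams[i]>>>(d_slice, 2);
        // The copy in stream i starts only after stream i's kernel has finished.
        cudaMemcpyAsync(h_pinned + i * slice, d_slice, slice * sizeof(int), cudaMemcpyDeviceToHost, streams[i]);
    }

    // Wait for every stream's work before touching h_pinned on the host.
    cudaDeviceSynchronize();

    for (int i = 0; i < nstreams; i++) {
        cudaStreamDestroy(streams[i]);
    }
    delete[] streams;
}

Because copies and kernels in different streams can overlap, the per-slice launches hide part of the device-to-host transfer time behind the remaining kernels, which is exactly the effect the "%d streams" timing above demonstrates.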

View File

@ -34,10 +34,10 @@
 */

// Includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@ -49,18 +49,18 @@
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

#define MIN_EPSILON_ERROR 5e-3f

////////////////////////////////////////////////////////////////////////////////
// Define the files that are to be save and the reference images for validation
const char *imageFilename = "teapot512.pgm";
const char *refFilename   = "ref_rotated.pgm";

float angle = 0.5f; // angle to rotate image by (in radians)

// Auto-Verification Code
bool testResult = true;

@ -73,223 +73,218 @@ static const char *sampleName = "simpleSurfaceWrite";
//! Write to a cuArray (texture data source) using surface writes
//! @param gIData input data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void surfaceWriteKernel(float *gIData, int width, int height, cudaSurfaceObject_t outputSurface)
{
    // calculate surface coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // read from global memory and write to cuarray (via surface reference)
    surf2Dwrite(gIData[y * width + x], outputSurface, x * 4, y, cudaBoundaryModeTrap);
}

////////////////////////////////////////////////////////////////////////////////
//! Transform an image using texture lookups
//! @param gOData output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *gOData, int width, int height, float theta, cudaTextureObject_t tex)
{
    // calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    float u = x / (float)width;
    float v = y / (float)height;

    // transform coordinates
    u -= 0.5f;
    v -= 0.5f;
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;

    // read from texture and write to global memory
    gOData[y * width + x] = tex2D<float>(tex, tu, tv);
}

////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, char **argv);

extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    // Process command-line arguments
    if (argc > 1) {
        if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
            getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename);

            if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
                getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename);
            }
            else {
                printf("-input flag should be used with -reference flag");
                exit(EXIT_FAILURE);
            }
        }
        else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
            printf("-reference flag should be used with -input flag");
            exit(EXIT_FAILURE);
        }
    }

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    // Use command-line specified CUDA device,
    // otherwise use device with highest Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    // Get number of SMs on this GPU
    cudaDeviceProp deviceProps;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors, SM %d.%d\n",
           deviceProps.name,
           deviceProps.multiProcessorCount,
           deviceProps.major,
           deviceProps.minor);

    // Load image from disk
    float       *hData = NULL;
    unsigned int width, height;
    char        *imagePath = sdkFindFilePath(imageFilename, argv[0]);

    if (imagePath == NULL) {
        printf("Unable to source image input file: %s\n", imageFilename);
        exit(EXIT_FAILURE);
    }

    sdkLoadPGM(imagePath, &hData, &width, &height);

    unsigned int size = width * height * sizeof(float);
    printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);

    // Load reference image from image (output)
    float *hDataRef = (float *)malloc(size);
    char  *refPath  = sdkFindFilePath(refFilename, argv[0]);

    if (refPath == NULL) {
        printf("Unable to find reference image file: %s\n", refFilename);
        exit(EXIT_FAILURE);
    }

    sdkLoadPGM(refPath, &hDataRef, &width, &height);

    // Allocate device memory for result
    float *dData = NULL;
    checkCudaErrors(cudaMalloc((void **)&dData, size));

    // Allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray            *cuArray;
    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height, cudaArraySurfaceLoadStore));

    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    cudaSurfaceObject_t outputSurface;
    cudaResourceDesc    surfRes;
    memset(&surfRes, 0, sizeof(cudaResourceDesc));
    surfRes.resType         = cudaResourceTypeArray;
    surfRes.res.array.array = cuArray;
    checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));

#if 1
    checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
    surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, outputSurface);
#else // This is what differs from the example simpleTexture
    checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
#endif

    cudaTextureObject_t tex;
cudaResourceDesc texRes; cudaResourceDesc texRes;
memset(&texRes, 0, sizeof(cudaResourceDesc)); memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray; texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = cuArray; texRes.res.array.array = cuArray;
cudaTextureDesc texDescr; cudaTextureDesc texDescr;
memset(&texDescr, 0, sizeof(cudaTextureDesc)); memset(&texDescr, 0, sizeof(cudaTextureDesc));
texDescr.normalizedCoords = true; texDescr.normalizedCoords = true;
texDescr.filterMode = cudaFilterModeLinear; texDescr.filterMode = cudaFilterModeLinear;
texDescr.addressMode[0] = cudaAddressModeWrap; texDescr.addressMode[0] = cudaAddressModeWrap;
texDescr.addressMode[1] = cudaAddressModeWrap; texDescr.addressMode[1] = cudaAddressModeWrap;
texDescr.readMode = cudaReadModeElementType; texDescr.readMode = cudaReadModeElementType;
checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
// Warmup // Warmup
transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex); transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
StopWatchInterface *timer = NULL; StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer); sdkCreateTimer(&timer);
sdkStartTimer(&timer); sdkStartTimer(&timer);
// Execute the kernel // Execute the kernel
transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex); transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
// Check if kernel execution generated an error // Check if kernel execution generated an error
getLastCudaError("Kernel execution failed"); getLastCudaError("Kernel execution failed");
cudaDeviceSynchronize(); cudaDeviceSynchronize();
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
printf("%.2f Mpixels/sec\n", printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
(width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); sdkDeleteTimer(&timer);
sdkDeleteTimer(&timer);
// Allocate mem for the result on host side // Allocate mem for the result on host side
float *hOData = (float *)malloc(size); float *hOData = (float *)malloc(size);
// copy result from device to host // copy result from device to host
checkCudaErrors(cudaMemcpy(hOData, dData, size, cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(hOData, dData, size, cudaMemcpyDeviceToHost));
// Write result to file // Write result to file
char outputFilename[1024]; char outputFilename[1024];
strcpy(outputFilename, "output.pgm"); strcpy(outputFilename, "output.pgm");
sdkSavePGM("output.pgm", hOData, width, height); sdkSavePGM("output.pgm", hOData, width, height);
printf("Wrote '%s'\n", outputFilename); printf("Wrote '%s'\n", outputFilename);
// Write regression file if necessary // Write regression file if necessary
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// Write file for regression test // Write file for regression test
sdkWriteFile<float>("./data/regression.dat", hOData, width * height, 0.0f, sdkWriteFile<float>("./data/regression.dat", hOData, width * height, 0.0f, false);
false); }
} else { else {
// We need to reload the data from disk, // We need to reload the data from disk,
// because it is inverted upon output // because it is inverted upon output
sdkLoadPGM(outputFilename, &hOData, &width, &height); sdkLoadPGM(outputFilename, &hOData, &width, &height);
printf("Comparing files\n"); printf("Comparing files\n");
printf("\toutput: <%s>\n", outputFilename); printf("\toutput: <%s>\n", outputFilename);
printf("\treference: <%s>\n", refPath); printf("\treference: <%s>\n", refPath);
testResult = testResult = compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f);
compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f); }
}
checkCudaErrors(cudaDestroySurfaceObject(outputSurface)); checkCudaErrors(cudaDestroySurfaceObject(outputSurface));
checkCudaErrors(cudaDestroyTextureObject(tex)); checkCudaErrors(cudaDestroyTextureObject(tex));
checkCudaErrors(cudaFree(dData)); checkCudaErrors(cudaFree(dData));
checkCudaErrors(cudaFreeArray(cuArray)); checkCudaErrors(cudaFreeArray(cuArray));
free(imagePath); free(imagePath);
free(refPath); free(refPath);
} }
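The body of surfaceWriteKernel is not part of this hunk; for orientation, a minimal sketch of a kernel of that shape (hypothetical name and body, assuming the same parameter order used at the launch site above and the surface created with cudaArraySurfaceLoadStore) would be:

// Hypothetical sketch, not part of this commit: copy a float buffer into the
// surface object. Note that surf2Dwrite takes the x coordinate in bytes.
__global__ void surfaceWriteSketch(float *gIData, int width, int height, cudaSurfaceObject_t outputSurface)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < (unsigned int)width && y < (unsigned int)height) {
        surf2Dwrite(gIData[y * width + x], outputSurface, x * sizeof(float), y);
    }
}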


@ -68,106 +68,118 @@
// this // this
// struct by putting an undefined symbol in the function body so it won't // struct by putting an undefined symbol in the function body so it won't
// compile. // compile.
template <typename T> template <typename T> struct SharedMemory
struct SharedMemory { {
// Ensure that we won't compile any un-specialized types // Ensure that we won't compile any un-specialized types
__device__ T *getPointer() { __device__ T *getPointer()
extern __device__ void error(void); {
error(); extern __device__ void error(void);
return NULL; error();
} return NULL;
}
}; };
// Following are the specializations for the following types. // Following are the specializations for the following types.
// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double // int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double
// One could also specialize it for user-defined types. // One could also specialize it for user-defined types.
template <> template <> struct SharedMemory<int>
struct SharedMemory<int> { {
__device__ int *getPointer() { __device__ int *getPointer()
extern __shared__ int s_int[]; {
return s_int; extern __shared__ int s_int[];
} return s_int;
}
}; };
template <> template <> struct SharedMemory<unsigned int>
struct SharedMemory<unsigned int> { {
__device__ unsigned int *getPointer() { __device__ unsigned int *getPointer()
extern __shared__ unsigned int s_uint[]; {
return s_uint; extern __shared__ unsigned int s_uint[];
} return s_uint;
}
}; };
template <> template <> struct SharedMemory<char>
struct SharedMemory<char> { {
__device__ char *getPointer() { __device__ char *getPointer()
extern __shared__ char s_char[]; {
return s_char; extern __shared__ char s_char[];
} return s_char;
}
}; };
template <> template <> struct SharedMemory<unsigned char>
struct SharedMemory<unsigned char> { {
__device__ unsigned char *getPointer() { __device__ unsigned char *getPointer()
extern __shared__ unsigned char s_uchar[]; {
return s_uchar; extern __shared__ unsigned char s_uchar[];
} return s_uchar;
}
}; };
template <> template <> struct SharedMemory<short>
struct SharedMemory<short> { {
__device__ short *getPointer() { __device__ short *getPointer()
extern __shared__ short s_short[]; {
return s_short; extern __shared__ short s_short[];
} return s_short;
}
}; };
template <> template <> struct SharedMemory<unsigned short>
struct SharedMemory<unsigned short> { {
__device__ unsigned short *getPointer() { __device__ unsigned short *getPointer()
extern __shared__ unsigned short s_ushort[]; {
return s_ushort; extern __shared__ unsigned short s_ushort[];
} return s_ushort;
}
}; };
template <> template <> struct SharedMemory<long>
struct SharedMemory<long> { {
__device__ long *getPointer() { __device__ long *getPointer()
extern __shared__ long s_long[]; {
return s_long; extern __shared__ long s_long[];
} return s_long;
}
}; };
template <> template <> struct SharedMemory<unsigned long>
struct SharedMemory<unsigned long> { {
__device__ unsigned long *getPointer() { __device__ unsigned long *getPointer()
extern __shared__ unsigned long s_ulong[]; {
return s_ulong; extern __shared__ unsigned long s_ulong[];
} return s_ulong;
}
}; };
template <> template <> struct SharedMemory<bool>
struct SharedMemory<bool> { {
__device__ bool *getPointer() { __device__ bool *getPointer()
extern __shared__ bool s_bool[]; {
return s_bool; extern __shared__ bool s_bool[];
} return s_bool;
}
}; };
template <> template <> struct SharedMemory<float>
struct SharedMemory<float> { {
__device__ float *getPointer() { __device__ float *getPointer()
extern __shared__ float s_float[]; {
return s_float; extern __shared__ float s_float[];
} return s_float;
}
}; };
template <> template <> struct SharedMemory<double>
struct SharedMemory<double> { {
__device__ double *getPointer() { __device__ double *getPointer()
extern __shared__ double s_double[]; {
return s_double; extern __shared__ double s_double[];
} return s_double;
}
}; };
#endif //_SHAREDMEM_H_ #endif //_SHAREDMEM_H_
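As the comment in this header notes, the same pattern extends to user-defined types. A minimal sketch (not part of this file), assuming a hypothetical POD struct named Particle:

struct Particle
{
    float x, y, z, w;
};

// Specialization declaring the extern shared array under a unique name,
// exactly as the built-in specializations above do.
template <> struct SharedMemory<Particle>
{
    __device__ Particle *getPointer()
    {
        extern __shared__ Particle s_particle[];
        return s_particle;
    }
};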


@ -26,23 +26,23 @@
*/ */
/* This sample is a templatized version of the template project. /* This sample is a templatized version of the template project.
* It also shows how to correctly templatize dynamically allocated shared * It also shows how to correctly templatize dynamically allocated shared
* memory arrays. * memory arrays.
* Host code. * Host code.
*/ */
// System includes // System includes
#include <stdio.h>
#include <assert.h> #include <assert.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <string.h>
// CUDA runtime // CUDA runtime
#include <cuda_runtime.h> #include <cuda_runtime.h>
// helper functions and utilities to work with CUDA // helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h>
#ifndef MAX #ifndef MAX
#define MAX(a, b) (a > b ? a : b) #define MAX(a, b) (a > b ? a : b)
@ -58,55 +58,55 @@ int g_TotalFailures = 0;
//! @param g_idata input data in global memory //! @param g_idata input data in global memory
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class T> template <class T> __global__ void testKernel(T *g_idata, T *g_odata)
__global__ void testKernel(T *g_idata, T *g_odata) { {
// Shared mem size is determined by the host app at run time // Shared mem size is determined by the host app at run time
SharedMemory<T> smem; SharedMemory<T> smem;
T *sdata = smem.getPointer(); T *sdata = smem.getPointer();
// access thread id // access thread id
const unsigned int tid = threadIdx.x; const unsigned int tid = threadIdx.x;
// access number of threads in this block // access number of threads in this block
const unsigned int num_threads = blockDim.x; const unsigned int num_threads = blockDim.x;
// read in input data from global memory // read in input data from global memory
sdata[tid] = g_idata[tid]; sdata[tid] = g_idata[tid];
__syncthreads(); __syncthreads();
// perform some computations // perform some computations
sdata[tid] = (T)num_threads * sdata[tid]; sdata[tid] = (T)num_threads * sdata[tid];
__syncthreads(); __syncthreads();
// write data to global memory // write data to global memory
g_odata[tid] = sdata[tid]; g_odata[tid] = sdata[tid];
} }
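The SharedMemory<T> indirection used above exists because a templated kernel cannot simply declare the dynamic shared array per type: once two instantiations exist, each one declares the same extern array symbol with a different type and the compiler rejects the duplicate, conflicting definitions. Illustrative only (this is the pattern the wrapper avoids, not code from the sample):

template <class T> __global__ void naiveKernel(T *g_idata, T *g_odata)
{
    // Fails once naiveKernel<int> and naiveKernel<float> are both instantiated:
    // both declare the extern symbol "sdata" with incompatible element types.
    extern __shared__ T sdata[];

    const unsigned int tid = threadIdx.x;
    sdata[tid]             = g_idata[tid];
    __syncthreads();
    g_odata[tid] = sdata[tid];
}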
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// declaration, forward // declaration, forward
template <class T> template <class T> void runTest(int argc, char **argv, int len);
void runTest(int argc, char **argv, int len);
template <class T> template <class T> void computeGold(T *reference, T *idata, const unsigned int len)
void computeGold(T *reference, T *idata, const unsigned int len) { {
const T T_len = static_cast<T>(len); const T T_len = static_cast<T>(len);
for (unsigned int i = 0; i < len; ++i) { for (unsigned int i = 0; i < len; ++i) {
reference[i] = idata[i] * T_len; reference[i] = idata[i] * T_len;
} }
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
printf("> runTest<float,32>\n"); {
runTest<float>(argc, argv, 32); printf("> runTest<float,32>\n");
printf("> runTest<int,64>\n"); runTest<float>(argc, argv, 32);
runTest<int>(argc, argv, 64); printf("> runTest<int,64>\n");
runTest<int>(argc, argv, 64);
printf("\n[simpleTemplates] -> Test Results: %d Failures\n", g_TotalFailures); printf("\n[simpleTemplates] -> Test Results: %d Failures\n", g_TotalFailures);
exit(g_TotalFailures == 0 ? EXIT_SUCCESS : EXIT_FAILURE); exit(g_TotalFailures == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
} }
// To completely templatize runTest (below) with cutil, we need to use // To completely templatize runTest (below) with cutil, we need to use
@ -114,151 +114,152 @@ int main(int argc, char **argv) {
// functions for different types. // functions for different types.
// Here's the generic wrapper for cutCompare* // Here's the generic wrapper for cutCompare*
template <class T> template <class T> class ArrayComparator
class ArrayComparator { {
public: public:
bool compare(const T *reference, T *data, unsigned int len) { bool compare(const T *reference, T *data, unsigned int len)
fprintf(stderr, {
"Error: no comparison function implemented for this type\n"); fprintf(stderr, "Error: no comparison function implemented for this type\n");
return false; return false;
} }
}; };
// Here's the specialization for ints: // Here's the specialization for ints:
template <> template <> class ArrayComparator<int>
class ArrayComparator<int> { {
public: public:
bool compare(const int *reference, int *data, unsigned int len) { bool compare(const int *reference, int *data, unsigned int len)
return compareData(reference, data, len, 0.15f, 0.0f); {
} return compareData(reference, data, len, 0.15f, 0.0f);
}
}; };
// Here's the specialization for floats: // Here's the specialization for floats:
template <> template <> class ArrayComparator<float>
class ArrayComparator<float> { {
public: public:
bool compare(const float *reference, float *data, unsigned int len) { bool compare(const float *reference, float *data, unsigned int len)
return compareData(reference, data, len, 0.15f, 0.15f); {
} return compareData(reference, data, len, 0.15f, 0.15f);
}
}; };
// Here's the generic wrapper for cutWriteFile* // Here's the generic wrapper for cutWriteFile*
template <class T> template <class T> class ArrayFileWriter
class ArrayFileWriter { {
public: public:
bool write(const char *filename, T *data, unsigned int len, float epsilon) { bool write(const char *filename, T *data, unsigned int len, float epsilon)
fprintf(stderr, {
"Error: no file write function implemented for this type\n"); fprintf(stderr, "Error: no file write function implemented for this type\n");
return false; return false;
} }
}; };
// Here's the specialization for ints: // Here's the specialization for ints:
template <> template <> class ArrayFileWriter<int>
class ArrayFileWriter<int> { {
public: public:
bool write(const char *filename, int *data, unsigned int len, float epsilon) { bool write(const char *filename, int *data, unsigned int len, float epsilon)
return sdkWriteFile(filename, data, len, epsilon, false); {
} return sdkWriteFile(filename, data, len, epsilon, false);
}
}; };
// Here's the specialization for floats: // Here's the specialization for floats:
template <> template <> class ArrayFileWriter<float>
class ArrayFileWriter<float> { {
public: public:
bool write(const char *filename, float *data, unsigned int len, bool write(const char *filename, float *data, unsigned int len, float epsilon)
float epsilon) { {
return sdkWriteFile(filename, data, len, epsilon, false); return sdkWriteFile(filename, data, len, epsilon, false);
} }
}; };
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class T> template <class T> void runTest(int argc, char **argv, int len)
void runTest(int argc, char **argv, int len) { {
int devID; int devID;
cudaDeviceProp deviceProps; cudaDeviceProp deviceProps;
devID = findCudaDevice(argc, (const char **)argv); devID = findCudaDevice(argc, (const char **)argv);
// get number of SMs on this GPU // get number of SMs on this GPU
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);
deviceProps.multiProcessorCount);
// create and start timer // create and start timer
StopWatchInterface *timer = NULL; StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer); sdkCreateTimer(&timer);
// start the timer // start the timer
sdkStartTimer(&timer); sdkStartTimer(&timer);
unsigned int num_threads = len; unsigned int num_threads = len;
unsigned int mem_size = sizeof(float) * num_threads; unsigned int mem_size = sizeof(float) * num_threads;
// allocate host memory // allocate host memory
T *h_idata = (T *)malloc(mem_size); T *h_idata = (T *)malloc(mem_size);
// initialize the memory // initialize the memory
for (unsigned int i = 0; i < num_threads; ++i) { for (unsigned int i = 0; i < num_threads; ++i) {
h_idata[i] = (T)i; h_idata[i] = (T)i;
} }
// allocate device memory // allocate device memory
T *d_idata; T *d_idata;
checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size)); checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
// copy host memory to device // copy host memory to device
checkCudaErrors( checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
// allocate device memory for result // allocate device memory for result
T *d_odata; T *d_odata;
checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size)); checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size));
// setup execution parameters // setup execution parameters
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
dim3 threads(num_threads, 1, 1); dim3 threads(num_threads, 1, 1);
// execute the kernel // execute the kernel
testKernel<T><<<grid, threads, mem_size>>>(d_idata, d_odata); testKernel<T><<<grid, threads, mem_size>>>(d_idata, d_odata);
// check if kernel execution generated an error // check if kernel execution generated an error
getLastCudaError("Kernel execution failed"); getLastCudaError("Kernel execution failed");
// allocate mem for the result on host side // allocate mem for the result on host side
T *h_odata = (T *)malloc(mem_size); T *h_odata = (T *)malloc(mem_size);
// copy result from device to host // copy result from device to host
checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads, checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads, cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost));
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
sdkDeleteTimer(&timer); sdkDeleteTimer(&timer);
// compute reference solution // compute reference solution
T *reference = (T *)malloc(mem_size); T *reference = (T *)malloc(mem_size);
computeGold<T>(reference, h_idata, num_threads); computeGold<T>(reference, h_idata, num_threads);
ArrayComparator<T> comparator; ArrayComparator<T> comparator;
ArrayFileWriter<T> writer; ArrayFileWriter<T> writer;
// check result // check result
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// write file for regression test // write file for regression test
writer.write("./data/regression.dat", h_odata, num_threads, 0.0f); writer.write("./data/regression.dat", h_odata, num_threads, 0.0f);
} else { }
// custom output handling when no regression test running else {
// in this case check if the result is equivalent to the expected solution // custom output handling when no regression test running
bool res = comparator.compare(reference, h_odata, num_threads); // in this case check if the result is equivalent to the expected solution
printf("Compare %s\n\n", (1 == res) ? "OK" : "MISMATCH"); bool res = comparator.compare(reference, h_odata, num_threads);
g_TotalFailures += (1 != res); printf("Compare %s\n\n", (1 == res) ? "OK" : "MISMATCH");
} g_TotalFailures += (1 != res);
}
// cleanup memory // cleanup memory
free(h_idata); free(h_idata);
free(h_odata); free(h_odata);
free(reference); free(reference);
checkCudaErrors(cudaFree(d_idata)); checkCudaErrors(cudaFree(d_idata));
checkCudaErrors(cudaFree(d_odata)); checkCudaErrors(cudaFree(d_odata));
} }
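One detail worth noting while reading this hunk: mem_size is computed from sizeof(float) even though the buffers hold T. That is harmless for the two instantiations used here (int and float are both 4 bytes on the supported platforms), but a wider type such as double would under-allocate. A type-safe variant, shown only as a sketch, would be:

unsigned int mem_size = sizeof(T) * num_threads; // sized for the actual element type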


@ -34,10 +34,10 @@
*/ */
// Includes, system // Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32 #ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN #define WINDOWS_LEAN_AND_MEAN
@ -49,22 +49,22 @@
#include <cuda_runtime.h> #include <cuda_runtime.h>
// Utilities and timing functions // Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h #include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions // CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check #include <helper_cuda.h> // helper functions for CUDA error check
#define MAX_EPSILON_ERROR 5e-3f #define MAX_EPSILON_ERROR 5e-3f
// Define the files that are to be saved and the reference images for validation // Define the files that are to be saved and the reference images for validation
const char *imageFilename = "teapot512.pgm"; const char *imageFilename = "teapot512.pgm";
const char *refFilename = "ref_rotated.pgm"; const char *refFilename = "ref_rotated.pgm";
const char *sampleName = "simpleTexture"; const char *sampleName = "simpleTexture";
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Constants // Constants
const float angle = 0.5f; // angle to rotate image by (in radians) const float angle = 0.5f; // angle to rotate image by (in radians)
// Auto-Verification Code // Auto-Verification Code
bool testResult = true; bool testResult = true;
@ -73,22 +73,22 @@ bool testResult = true;
//! Transform an image using texture lookups //! Transform an image using texture lookups
//! @param outputData output data in global memory //! @param outputData output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *outputData, int width, int height, __global__ void transformKernel(float *outputData, int width, int height, float theta, cudaTextureObject_t tex)
float theta, cudaTextureObject_t tex) { {
// calculate normalized texture coordinates // calculate normalized texture coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
float u = (float)x - (float)width / 2; float u = (float)x - (float)width / 2;
float v = (float)y - (float)height / 2; float v = (float)y - (float)height / 2;
float tu = u * cosf(theta) - v * sinf(theta); float tu = u * cosf(theta) - v * sinf(theta);
float tv = v * cosf(theta) + u * sinf(theta); float tv = v * cosf(theta) + u * sinf(theta);
tu /= (float)width; tu /= (float)width;
tv /= (float)height; tv /= (float)height;
// read from texture and write to global memory // read from texture and write to global memory
outputData[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f); outputData[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f);
} }
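In other words, each output pixel (x, y) samples the input at the position obtained by rotating its centred coordinates by theta and renormalising to texture space (the +0.5 re-centring is applied at the tex2D call). Written out, the kernel computes

\[
t_u = \frac{u\cos\theta - v\sin\theta}{W} + \tfrac{1}{2}, \qquad
t_v = \frac{u\sin\theta + v\cos\theta}{H} + \tfrac{1}{2}, \qquad
u = x - \tfrac{W}{2}, \; v = y - \tfrac{H}{2}.
\]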
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -98,154 +98,151 @@ void runTest(int argc, char **argv);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
printf("%s starting...\n", sampleName); {
printf("%s starting...\n", sampleName);
// Process command-line arguments // Process command-line arguments
if (argc > 1) { if (argc > 1) {
if (checkCmdLineFlag(argc, (const char **)argv, "input")) { if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
getCmdLineArgumentString(argc, (const char **)argv, "input", getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename);
(char **)&imageFilename);
if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
getCmdLineArgumentString(argc, (const char **)argv, "reference", getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename);
(char **)&refFilename); }
} else { else {
printf("-input flag should be used with -reference flag"); printf("-input flag should be used with -reference flag");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { }
printf("-reference flag should be used with -input flag"); else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
exit(EXIT_FAILURE); printf("-reference flag should be used with -input flag");
exit(EXIT_FAILURE);
}
} }
}
runTest(argc, argv); runTest(argc, argv);
printf("%s completed, returned %s\n", sampleName, printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
testResult ? "OK" : "ERROR!"); exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
int devID = findCudaDevice(argc, (const char **)argv); {
int devID = findCudaDevice(argc, (const char **)argv);
// load image from disk // load image from disk
float *hData = NULL; float *hData = NULL;
unsigned int width, height; unsigned int width, height;
char *imagePath = sdkFindFilePath(imageFilename, argv[0]); char *imagePath = sdkFindFilePath(imageFilename, argv[0]);
if (imagePath == NULL) { if (imagePath == NULL) {
printf("Unable to source image file: %s\n", imageFilename); printf("Unable to source image file: %s\n", imageFilename);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
sdkLoadPGM(imagePath, &hData, &width, &height); sdkLoadPGM(imagePath, &hData, &width, &height);
unsigned int size = width * height * sizeof(float); unsigned int size = width * height * sizeof(float);
printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height); printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);
// Load reference image from image (output) // Load reference image from image (output)
float *hDataRef = (float *)malloc(size); float *hDataRef = (float *)malloc(size);
char *refPath = sdkFindFilePath(refFilename, argv[0]); char *refPath = sdkFindFilePath(refFilename, argv[0]);
if (refPath == NULL) { if (refPath == NULL) {
printf("Unable to find reference image file: %s\n", refFilename); printf("Unable to find reference image file: %s\n", refFilename);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
sdkLoadPGM(refPath, &hDataRef, &width, &height); sdkLoadPGM(refPath, &hDataRef, &width, &height);
// Allocate device memory for result // Allocate device memory for result
float *dData = NULL; float *dData = NULL;
checkCudaErrors(cudaMalloc((void **)&dData, size)); checkCudaErrors(cudaMalloc((void **)&dData, size));
// Allocate array and copy image data // Allocate array and copy image data
cudaChannelFormatDesc channelDesc = cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); cudaArray *cuArray;
cudaArray *cuArray; checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height)); checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
checkCudaErrors(
cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
cudaTextureObject_t tex; cudaTextureObject_t tex;
cudaResourceDesc texRes; cudaResourceDesc texRes;
memset(&texRes, 0, sizeof(cudaResourceDesc)); memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray; texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = cuArray; texRes.res.array.array = cuArray;
cudaTextureDesc texDescr; cudaTextureDesc texDescr;
memset(&texDescr, 0, sizeof(cudaTextureDesc)); memset(&texDescr, 0, sizeof(cudaTextureDesc));
texDescr.normalizedCoords = true; texDescr.normalizedCoords = true;
texDescr.filterMode = cudaFilterModeLinear; texDescr.filterMode = cudaFilterModeLinear;
texDescr.addressMode[0] = cudaAddressModeWrap; texDescr.addressMode[0] = cudaAddressModeWrap;
texDescr.addressMode[1] = cudaAddressModeWrap; texDescr.addressMode[1] = cudaAddressModeWrap;
texDescr.readMode = cudaReadModeElementType; texDescr.readMode = cudaReadModeElementType;
checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
dim3 dimBlock(8, 8, 1); dim3 dimBlock(8, 8, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1); dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
// Warmup // Warmup
transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex); transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
StopWatchInterface *timer = NULL; StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer); sdkCreateTimer(&timer);
sdkStartTimer(&timer); sdkStartTimer(&timer);
// Execute the kernel // Execute the kernel
transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex); transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
// Check if kernel execution generated an error // Check if kernel execution generated an error
getLastCudaError("Kernel execution failed"); getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
printf("%.2f Mpixels/sec\n", printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
(width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); sdkDeleteTimer(&timer);
sdkDeleteTimer(&timer);
// Allocate mem for the result on host side // Allocate mem for the result on host side
float *hOutputData = (float *)malloc(size); float *hOutputData = (float *)malloc(size);
// copy result from device to host // copy result from device to host
checkCudaErrors(cudaMemcpy(hOutputData, dData, size, cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(hOutputData, dData, size, cudaMemcpyDeviceToHost));
// Write result to file // Write result to file
char outputFilename[1024]; char outputFilename[1024];
strcpy(outputFilename, imagePath); strcpy(outputFilename, imagePath);
strcpy(outputFilename + strlen(imagePath) - 4, "_out.pgm"); strcpy(outputFilename + strlen(imagePath) - 4, "_out.pgm");
sdkSavePGM(outputFilename, hOutputData, width, height); sdkSavePGM(outputFilename, hOutputData, width, height);
printf("Wrote '%s'\n", outputFilename); printf("Wrote '%s'\n", outputFilename);
// Write regression file if necessary // Write regression file if necessary
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// Write file for regression test // Write file for regression test
sdkWriteFile<float>("./data/regression.dat", hOutputData, width * height, sdkWriteFile<float>("./data/regression.dat", hOutputData, width * height, 0.0f, false);
0.0f, false); }
} else { else {
// We need to reload the data from disk, // We need to reload the data from disk,
// because it is inverted upon output // because it is inverted upon output
sdkLoadPGM(outputFilename, &hOutputData, &width, &height); sdkLoadPGM(outputFilename, &hOutputData, &width, &height);
printf("Comparing files\n"); printf("Comparing files\n");
printf("\toutput: <%s>\n", outputFilename); printf("\toutput: <%s>\n", outputFilename);
printf("\treference: <%s>\n", refPath); printf("\treference: <%s>\n", refPath);
testResult = compareData(hOutputData, hDataRef, width * height, testResult = compareData(hOutputData, hDataRef, width * height, MAX_EPSILON_ERROR, 0.15f);
MAX_EPSILON_ERROR, 0.15f); }
}
checkCudaErrors(cudaDestroyTextureObject(tex)); checkCudaErrors(cudaDestroyTextureObject(tex));
checkCudaErrors(cudaFree(dData)); checkCudaErrors(cudaFree(dData));
checkCudaErrors(cudaFreeArray(cuArray)); checkCudaErrors(cudaFreeArray(cuArray));
free(imagePath); free(imagePath);
free(refPath); free(refPath);
} }
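Not a change made by this formatting commit, but worth flagging while in this file: cudaMemcpyToArray has been deprecated in recent CUDA releases. A sketch of the equivalent copy with the non-deprecated API, assuming the same tightly packed host buffer (so the source pitch equals the row width in bytes):

checkCudaErrors(cudaMemcpy2DToArray(
    cuArray, 0, 0, hData, width * sizeof(float), width * sizeof(float), height, cudaMemcpyHostToDevice));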


@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details) ## References (for more details)


@ -32,11 +32,11 @@
using 3D texture lookups. using 3D texture lookups.
*/ */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <helper_gl.h> #include <helper_gl.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if defined(__APPLE__) || defined(MACOSX) #if defined(__APPLE__) || defined(MACOSX)
#pragma clang diagnostic ignored "-Wdeprecated-declarations" #pragma clang diagnostic ignored "-Wdeprecated-declarations"
@ -49,53 +49,52 @@
#endif #endif
// includes, cuda // includes, cuda
#include <vector_types.h>
#include <cuda_runtime.h>
#include <cuda_gl_interop.h> #include <cuda_gl_interop.h>
#include <cuda_runtime.h>
#include <vector_types.h>
// CUDA utilities and system includes // CUDA utilities and system includes
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h> #include <helper_functions.h>
#include <vector_types.h> #include <vector_types.h>
typedef unsigned int uint; typedef unsigned int uint;
typedef unsigned char uchar; typedef unsigned char uchar;
#define MAX_EPSILON_ERROR 5.0f #define MAX_EPSILON_ERROR 5.0f
#define THRESHOLD 0.15f #define THRESHOLD 0.15f
const char *sSDKsample = "simpleTexture3D"; const char *sSDKsample = "simpleTexture3D";
const char *volumeFilename = "Bucky.raw"; const char *volumeFilename = "Bucky.raw";
const cudaExtent volumeSize = make_cudaExtent(32, 32, 32); const cudaExtent volumeSize = make_cudaExtent(32, 32, 32);
const uint width = 512, height = 512; const uint width = 512, height = 512;
const dim3 blockSize(16, 16, 1); const dim3 blockSize(16, 16, 1);
const dim3 gridSize(width / blockSize.x, height / blockSize.y); const dim3 gridSize(width / blockSize.x, height / blockSize.y);
float w = 0.5; // texture coordinate in z float w = 0.5; // texture coordinate in z
GLuint pbo; // OpenGL pixel buffer object GLuint pbo; // OpenGL pixel buffer object
struct cudaGraphicsResource struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO)
*cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO)
bool linearFiltering = true; bool linearFiltering = true;
bool animate = true; bool animate = true;
StopWatchInterface *timer = NULL; StopWatchInterface *timer = NULL;
uint *d_output = NULL; uint *d_output = NULL;
// Auto-Verification Code // Auto-Verification Code
const int frameCheckNumber = 4; const int frameCheckNumber = 4;
int fpsCount = 0; // FPS count for averaging int fpsCount = 0; // FPS count for averaging
int fpsLimit = 1; // FPS limit for sampling int fpsLimit = 1; // FPS limit for sampling
int g_Index = 0; int g_Index = 0;
unsigned int frameCount = 0; unsigned int frameCount = 0;
unsigned int g_TotalErrors = 0; unsigned int g_TotalErrors = 0;
volatile int g_GraphicsMapFlag = 0; volatile int g_GraphicsMapFlag = 0;
int *pArgc = NULL; int *pArgc = NULL;
char **pArgv = NULL; char **pArgv = NULL;
#ifndef MAX #ifndef MAX
@ -105,288 +104,294 @@ char **pArgv = NULL;
extern "C" void cleanup(); extern "C" void cleanup();
extern "C" void setTextureFilterMode(bool bLinearFilter); extern "C" void setTextureFilterMode(bool bLinearFilter);
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize); extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize);
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w);
uint imageW, uint imageH, float w); extern void cleanupCuda();
extern void cleanupCuda();
void loadVolumeData(char *exec_path); void loadVolumeData(char *exec_path);
void computeFPS() { void computeFPS()
frameCount++; {
fpsCount++; frameCount++;
fpsCount++;
if (fpsCount == fpsLimit) { if (fpsCount == fpsLimit) {
char fps[256]; char fps[256];
float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
sprintf(fps, "%s: %3.1f fps", sSDKsample, ifps); sprintf(fps, "%s: %3.1f fps", sSDKsample, ifps);
glutSetWindowTitle(fps); glutSetWindowTitle(fps);
fpsCount = 0; fpsCount = 0;
fpsLimit = ftoi(MAX(1.0f, ifps)); fpsLimit = ftoi(MAX(1.0f, ifps));
sdkResetTimer(&timer); sdkResetTimer(&timer);
} }
} }
// render image using CUDA // render image using CUDA
void render() { void render()
// map PBO to get CUDA device pointer {
g_GraphicsMapFlag++; // map PBO to get CUDA device pointer
checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); g_GraphicsMapFlag++;
size_t num_bytes; checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
checkCudaErrors(cudaGraphicsResourceGetMappedPointer( size_t num_bytes;
(void **)&d_output, &num_bytes, cuda_pbo_resource)); checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource));
// printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);
// call CUDA kernel, writing results to PBO // call CUDA kernel, writing results to PBO
render_kernel(gridSize, blockSize, d_output, width, height, w); render_kernel(gridSize, blockSize, d_output, width, height, w);
getLastCudaError("render_kernel failed"); getLastCudaError("render_kernel failed");
if (g_GraphicsMapFlag) { if (g_GraphicsMapFlag) {
checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
g_GraphicsMapFlag--; g_GraphicsMapFlag--;
} }
} }
// display results using OpenGL (called by GLUT) // display results using OpenGL (called by GLUT)
void display() { void display()
sdkStartTimer(&timer); {
sdkStartTimer(&timer);
render(); render();
// display results // display results
glClear(GL_COLOR_BUFFER_BIT); glClear(GL_COLOR_BUFFER_BIT);
// draw image from PBO // draw image from PBO
glDisable(GL_DEPTH_TEST); glDisable(GL_DEPTH_TEST);
glRasterPos2i(0, 0); glRasterPos2i(0, 0);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
glutSwapBuffers(); glutSwapBuffers();
glutReportErrors(); glutReportErrors();
sdkStopTimer(&timer); sdkStopTimer(&timer);
computeFPS(); computeFPS();
} }
void idle() { void idle()
if (animate) { {
w += 0.01f; if (animate) {
glutPostRedisplay(); w += 0.01f;
} glutPostRedisplay();
}
} }
void keyboard(unsigned char key, int x, int y) { void keyboard(unsigned char key, int x, int y)
switch (key) { {
switch (key) {
case 27: case 27:
#if defined(__APPLE__) || defined(MACOSX) #if defined(__APPLE__) || defined(MACOSX)
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
glutDestroyWindow(glutGetWindow()); glutDestroyWindow(glutGetWindow());
return; return;
#else #else
glutDestroyWindow(glutGetWindow()); glutDestroyWindow(glutGetWindow());
return; return;
#endif #endif
case '=': case '=':
case '+': case '+':
w += 0.01f; w += 0.01f;
break; break;
case '-': case '-':
w -= 0.01f; w -= 0.01f;
break; break;
case 'f': case 'f':
linearFiltering = !linearFiltering; linearFiltering = !linearFiltering;
setTextureFilterMode(linearFiltering); setTextureFilterMode(linearFiltering);
break; break;
case ' ': case ' ':
animate = !animate; animate = !animate;
break; break;
default: default:
break; break;
} }
glutPostRedisplay(); glutPostRedisplay();
} }
void reshape(int x, int y) { void reshape(int x, int y)
glViewport(0, 0, x, y); {
glViewport(0, 0, x, y);
glMatrixMode(GL_MODELVIEW); glMatrixMode(GL_MODELVIEW);
glLoadIdentity(); glLoadIdentity();
glMatrixMode(GL_PROJECTION); glMatrixMode(GL_PROJECTION);
glLoadIdentity(); glLoadIdentity();
glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
} }
void cleanup() { void cleanup()
sdkDeleteTimer(&timer); {
sdkDeleteTimer(&timer);
// add extra check to unmap the resource before unregistering it // add extra check to unmap the resource before unregistering it
if (g_GraphicsMapFlag) { if (g_GraphicsMapFlag) {
checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
g_GraphicsMapFlag--; g_GraphicsMapFlag--;
} }
// unregister this buffer object from CUDA C // unregister this buffer object from CUDA C
checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource));
glDeleteBuffers(1, &pbo); glDeleteBuffers(1, &pbo);
cleanupCuda(); cleanupCuda();
} }
void initGLBuffers() { void initGLBuffers()
// create pixel buffer object {
glGenBuffers(1, &pbo); // create pixel buffer object
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glGenBuffers(1, &pbo);
glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
0, GL_STREAM_DRAW_ARB); glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, 0, GL_STREAM_DRAW_ARB);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
// register this buffer object with CUDA // register this buffer object with CUDA
checkCudaErrors(cudaGraphicsGLRegisterBuffer( checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));
&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));
} }
// Load raw data from disk // Load raw data from disk
uchar *loadRawFile(const char *filename, size_t size) { uchar *loadRawFile(const char *filename, size_t size)
FILE *fp = fopen(filename, "rb"); {
FILE *fp = fopen(filename, "rb");
if (!fp) { if (!fp) {
fprintf(stderr, "Error opening file '%s'\n", filename); fprintf(stderr, "Error opening file '%s'\n", filename);
return 0; return 0;
} }
uchar *data = (uchar *)malloc(size); uchar *data = (uchar *)malloc(size);
size_t read = fread(data, 1, size, fp); size_t read = fread(data, 1, size, fp);
fclose(fp); fclose(fp);
printf("Read '%s', %zu bytes\n", filename, read); printf("Read '%s', %zu bytes\n", filename, read);
return data; return data;
} }
void initGL(int *argc, char **argv) { void initGL(int *argc, char **argv)
// initialize GLUT callback functions {
glutInit(argc, argv); // initialize GLUT callback functions
glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); glutInit(argc, argv);
glutInitWindowSize(width, height); glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
glutCreateWindow("CUDA 3D texture"); glutInitWindowSize(width, height);
glutDisplayFunc(display); glutCreateWindow("CUDA 3D texture");
glutKeyboardFunc(keyboard); glutDisplayFunc(display);
glutReshapeFunc(reshape); glutKeyboardFunc(keyboard);
glutIdleFunc(idle); glutReshapeFunc(reshape);
glutIdleFunc(idle);
if (!isGLVersionSupported(2, 0) || if (!isGLVersionSupported(2, 0) || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) {
!areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) { fprintf(stderr, "Required OpenGL extensions are missing.");
fprintf(stderr, "Required OpenGL extensions are missing."); exit(EXIT_FAILURE);
exit(EXIT_FAILURE); }
}
} }
void runAutoTest(const char *ref_file, char *exec_path) { void runAutoTest(const char *ref_file, char *exec_path)
checkCudaErrors( {
cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4)); checkCudaErrors(cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4));
// render the volumeData // render the volumeData
render_kernel(gridSize, blockSize, d_output, width, height, w); render_kernel(gridSize, blockSize, d_output, width, height, w);
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
getLastCudaError("render_kernel failed"); getLastCudaError("render_kernel failed");
void *h_output = malloc(width * height * sizeof(GLubyte) * 4); void *h_output = malloc(width * height * sizeof(GLubyte) * 4);
checkCudaErrors(cudaMemcpy(h_output, d_output, checkCudaErrors(cudaMemcpy(h_output, d_output, width * height * sizeof(GLubyte) * 4, cudaMemcpyDeviceToHost));
width * height * sizeof(GLubyte) * 4, sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4, "simpleTexture3D.bin");
cudaMemcpyDeviceToHost));
sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4,
"simpleTexture3D.bin");
bool bTestResult = sdkCompareBin2BinFloat( bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin",
"simpleTexture3D.bin", sdkFindFilePath(ref_file, exec_path), sdkFindFilePath(ref_file, exec_path),
width * height, MAX_EPSILON_ERROR, THRESHOLD, exec_path); width * height,
MAX_EPSILON_ERROR,
THRESHOLD,
exec_path);
checkCudaErrors(cudaFree(d_output)); checkCudaErrors(cudaFree(d_output));
free(h_output); free(h_output);
sdkStopTimer(&timer); sdkStopTimer(&timer);
sdkDeleteTimer(&timer); sdkDeleteTimer(&timer);
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
void loadVolumeData(char *exec_path) { void loadVolumeData(char *exec_path)
// load volume data {
const char *path = sdkFindFilePath(volumeFilename, exec_path); // load volume data
const char *path = sdkFindFilePath(volumeFilename, exec_path);
if (path == NULL) { if (path == NULL) {
fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
volumeFilename); exit(EXIT_FAILURE);
exit(EXIT_FAILURE); }
}
size_t size = volumeSize.width * volumeSize.height * volumeSize.depth; size_t size = volumeSize.width * volumeSize.height * volumeSize.depth;
uchar *h_volume = loadRawFile(path, size); uchar *h_volume = loadRawFile(path, size);
initCuda(h_volume, volumeSize); initCuda(h_volume, volumeSize);
sdkCreateTimer(&timer); sdkCreateTimer(&timer);
free(h_volume); free(h_volume);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
pArgc = &argc; {
pArgv = argv; pArgc = &argc;
pArgv = argv;
char *ref_file = NULL; char *ref_file = NULL;
#if defined(__linux__) #if defined(__linux__)
setenv("DISPLAY", ":0", 0); setenv("DISPLAY", ":0", 0);
#endif #endif
printf("%s Starting...\n\n", sSDKsample); printf("%s Starting...\n\n", sSDKsample);
if (checkCmdLineFlag(argc, (const char **)argv, "file")) { if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
fpsLimit = frameCheckNumber; fpsLimit = frameCheckNumber;
getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
} }
// use command-line specified CUDA device, otherwise use device with highest // use command-line specified CUDA device, otherwise use device with highest
// Gflops/s // Gflops/s
findCudaDevice(argc, (const char **)argv); findCudaDevice(argc, (const char **)argv);
if (ref_file) { if (ref_file) {
loadVolumeData(argv[0]); loadVolumeData(argv[0]);
runAutoTest(ref_file, argv[0]); runAutoTest(ref_file, argv[0]);
} else { }
initGL(&argc, argv); else {
initGL(&argc, argv);
// OpenGL buffers // OpenGL buffers
initGLBuffers(); initGLBuffers();
loadVolumeData(argv[0]); loadVolumeData(argv[0]);
} }
printf( printf("Press space to toggle animation\n"
"Press space to toggle animation\n" "Press '+' and '-' to change displayed slice\n");
"Press '+' and '-' to change displayed slice\n");
#if defined(__APPLE__) || defined(MACOSX) #if defined(__APPLE__) || defined(MACOSX)
atexit(cleanup); atexit(cleanup);
#else #else
glutCloseFunc(cleanup); glutCloseFunc(cleanup);
#endif #endif
glutMainLoop(); glutMainLoop();
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }


@ -28,111 +28,111 @@
#ifndef _SIMPLETEXTURE3D_KERNEL_CU_ #ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#define _SIMPLETEXTURE3D_KERNEL_CU_ #define _SIMPLETEXTURE3D_KERNEL_CU_
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_math.h> #include <helper_math.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef unsigned int uint; typedef unsigned int uint;
typedef unsigned char uchar; typedef unsigned char uchar;
cudaArray *d_volumeArray = 0; cudaArray *d_volumeArray = 0;
cudaTextureObject_t tex; // 3D texture cudaTextureObject_t tex; // 3D texture
__global__ void d_render(uint *d_output, uint imageW, uint imageH, float w, __global__ void d_render(uint *d_output, uint imageW, uint imageH, float w, cudaTextureObject_t texObj)
cudaTextureObject_t texObj) { {
uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
float u = x / (float)imageW; float u = x / (float)imageW;
float v = y / (float)imageH; float v = y / (float)imageH;
// read from 3D texture // read from 3D texture
float voxel = tex3D<float>(texObj, u, v, w); float voxel = tex3D<float>(texObj, u, v, w);
if ((x < imageW) && (y < imageH)) { if ((x < imageW) && (y < imageH)) {
// write output color // write output color
uint i = __umul24(y, imageW) + x; uint i = __umul24(y, imageW) + x;
d_output[i] = voxel * 255; d_output[i] = voxel * 255;
} }
} }
extern "C" void setTextureFilterMode(bool bLinearFilter) { extern "C" void setTextureFilterMode(bool bLinearFilter)
if (tex) { {
checkCudaErrors(cudaDestroyTextureObject(tex)); if (tex) {
} checkCudaErrors(cudaDestroyTextureObject(tex));
cudaResourceDesc texRes; }
memset(&texRes, 0, sizeof(cudaResourceDesc)); cudaResourceDesc texRes;
memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray; texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = d_volumeArray; texRes.res.array.array = d_volumeArray;
cudaTextureDesc texDescr; cudaTextureDesc texDescr;
memset(&texDescr, 0, sizeof(cudaTextureDesc)); memset(&texDescr, 0, sizeof(cudaTextureDesc));
texDescr.normalizedCoords = true; texDescr.normalizedCoords = true;
    texDescr.filterMode = texDescr.filterMode = bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
    bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint; texDescr.addressMode[0] = cudaAddressModeWrap;
    texDescr.addressMode[0] = cudaAddressModeWrap; texDescr.addressMode[1] = cudaAddressModeWrap;
    texDescr.addressMode[1] = cudaAddressModeWrap; texDescr.addressMode[2] = cudaAddressModeWrap;
    texDescr.addressMode[2] = cudaAddressModeWrap; texDescr.readMode = cudaReadModeNormalizedFloat;
    texDescr.readMode = cudaReadModeNormalizedFloat;
checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
} }
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize)
{
    // create 3D array
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
    checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));

    // copy data to 3D array
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr =
        make_cudaPitchedPtr((void *)h_volume, volumeSize.width * sizeof(uchar), volumeSize.width, volumeSize.height);
    copyParams.dstArray = d_volumeArray;
    copyParams.extent   = volumeSize;
    copyParams.kind     = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&copyParams));
cudaResourceDesc texRes; cudaResourceDesc texRes;
memset(&texRes, 0, sizeof(cudaResourceDesc)); memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray; texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = d_volumeArray; texRes.res.array.array = d_volumeArray;
cudaTextureDesc texDescr; cudaTextureDesc texDescr;
memset(&texDescr, 0, sizeof(cudaTextureDesc)); memset(&texDescr, 0, sizeof(cudaTextureDesc));
// access with normalized texture coordinates // access with normalized texture coordinates
texDescr.normalizedCoords = true; texDescr.normalizedCoords = true;
// linear interpolation // linear interpolation
texDescr.filterMode = cudaFilterModeLinear; texDescr.filterMode = cudaFilterModeLinear;
// wrap texture coordinates // wrap texture coordinates
texDescr.addressMode[0] = cudaAddressModeWrap; texDescr.addressMode[0] = cudaAddressModeWrap;
texDescr.addressMode[1] = cudaAddressModeWrap; texDescr.addressMode[1] = cudaAddressModeWrap;
texDescr.addressMode[2] = cudaAddressModeWrap; texDescr.addressMode[2] = cudaAddressModeWrap;
texDescr.readMode = cudaReadModeNormalizedFloat; texDescr.readMode = cudaReadModeNormalizedFloat;
checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
} }
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w)
{
    d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, w, tex);
}

void cleanupCuda()
{
    if (tex) {
        checkCudaErrors(cudaDestroyTextureObject(tex));
    }
    if (d_volumeArray) {
        checkCudaErrors(cudaFreeArray(d_volumeArray));
    }
}
#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_ #endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_
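As context for the kernels above, a minimal host-side sketch of how render_kernel might be driven; the output size (512x512) and the d_output buffer here are illustrative assumptions, not part of the sample:

    // Sketch only: size the grid so every output pixel gets one thread.
    uint  width = 512, height = 512; // assumed output dimensions
    uint *d_output = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_output, width * height * sizeof(uint)));

    dim3 blockSize(16, 16, 1);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, 1);

    // w selects the slice in the normalized [0,1] depth range of the 3D texture.
    render_kernel(gridSize, blockSize, d_output, width, height, 0.5f);
    checkCudaErrors(cudaDeviceSynchronize());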

View File

@ -26,29 +26,29 @@
*/ */
/* /*
 * This sample demonstrates how to use texture fetches in CUDA
* *
* This sample takes an input PGM image (image_filename) and generates * This sample takes an input PGM image (image_filename) and generates
* an output PGM image (image_filename_out). This CUDA kernel performs * an output PGM image (image_filename_out). This CUDA kernel performs
* a simple 2D transform (rotation) on the texture coordinates (u,v). * a simple 2D transform (rotation) on the texture coordinates (u,v).
* The results between simpleTexture and simpleTextureDrv are identical. * The results between simpleTexture and simpleTextureDrv are identical.
* The main difference is the implementation. simpleTextureDrv makes calls * The main difference is the implementation. simpleTextureDrv makes calls
* to the CUDA driver API and demonstrates how to use cuModuleLoad to load * to the CUDA driver API and demonstrates how to use cuModuleLoad to load
* the CUDA ptx (*.ptx) kernel just prior to kernel launch. * the CUDA ptx (*.ptx) kernel just prior to kernel launch.
* *
*/ */
// includes, system // includes, system
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, CUDA
#include <builtin_types.h>
#include <cuda.h>
// includes, project // includes, project
#include <helper_cuda_drvapi.h> #include <helper_cuda_drvapi.h>
#include <helper_functions.h> #include <helper_functions.h>
@ -56,8 +56,8 @@
using namespace std; using namespace std;
const char *image_filename = "teapot512.pgm"; const char *image_filename = "teapot512.pgm";
const char *ref_filename = "ref_rotated.pgm"; const char *ref_filename = "ref_rotated.pgm";
float angle = 0.5f; // angle to rotate image by (in radians) float angle = 0.5f; // angle to rotate image by (in radians)
#define MIN_EPSILON_ERROR 5e-3f #define MIN_EPSILON_ERROR 5e-3f
@ -65,8 +65,7 @@ float angle = 0.5f; // angle to rotate image by (in radians)
// declaration, forward // declaration, forward
void runTest(int argc, char **argv); void runTest(int argc, char **argv);
extern "C" void computeGold(float *reference, float *idata, const unsigned int len);
static CUresult initCUDA(int argc, char **argv, CUfunction *); static CUresult initCUDA(int argc, char **argv, CUfunction *);
@ -80,212 +79,227 @@ const char *sSDKsample = "simpleTextureDrv (Driver API)";
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Globals // Globals
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
CUdevice cuDevice; CUdevice cuDevice;
CUcontext cuContext; CUcontext cuContext;
CUmodule cuModule; CUmodule cuModule;
void showHelp()
{
    printf("\n> [%s] Command line options\n", sSDKsample);
    printf("\t-device=n (where n=0,1,2.... for the GPU device)\n\n");
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
        showHelp();
        return 0;
    }

    runTest(argc, argv);
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    bool bTestResults = true;
// initialize CUDA // initialize CUDA
CUfunction transform = NULL; CUfunction transform = NULL;
if (initCUDA(argc, argv, &transform) != CUDA_SUCCESS) { if (initCUDA(argc, argv, &transform) != CUDA_SUCCESS) {
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
// load image from disk // load image from disk
float *h_data = NULL; float *h_data = NULL;
unsigned int width, height; unsigned int width, height;
char *image_path = sdkFindFilePath(image_filename, argv[0]); char *image_path = sdkFindFilePath(image_filename, argv[0]);
if (image_path == NULL) { if (image_path == NULL) {
printf("Unable to find image file: '%s'\n", image_filename); printf("Unable to find image file: '%s'\n", image_filename);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
sdkLoadPGM(image_path, &h_data, &width, &height); sdkLoadPGM(image_path, &h_data, &width, &height);
size_t size = width * height * sizeof(float); size_t size = width * height * sizeof(float);
printf("Loaded '%s', %d x %d pixels\n", image_filename, width, height); printf("Loaded '%s', %d x %d pixels\n", image_filename, width, height);
// load reference image from image (output) // load reference image from image (output)
float *h_data_ref = (float *)malloc(size); float *h_data_ref = (float *)malloc(size);
char *ref_path = sdkFindFilePath(ref_filename, argv[0]); char *ref_path = sdkFindFilePath(ref_filename, argv[0]);
if (ref_path == NULL) { if (ref_path == NULL) {
printf("Unable to find reference file %s\n", ref_filename); printf("Unable to find reference file %s\n", ref_filename);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
sdkLoadPGM(ref_path, &h_data_ref, &width, &height); sdkLoadPGM(ref_path, &h_data_ref, &width, &height);
// allocate device memory for result // allocate device memory for result
CUdeviceptr d_data = (CUdeviceptr)NULL; CUdeviceptr d_data = (CUdeviceptr)NULL;
checkCudaErrors(cuMemAlloc(&d_data, size)); checkCudaErrors(cuMemAlloc(&d_data, size));
// allocate array and copy image data // allocate array and copy image data
CUarray cu_array; CUarray cu_array;
CUDA_ARRAY_DESCRIPTOR desc; CUDA_ARRAY_DESCRIPTOR desc;
desc.Format = CU_AD_FORMAT_FLOAT; desc.Format = CU_AD_FORMAT_FLOAT;
desc.NumChannels = 1; desc.NumChannels = 1;
desc.Width = width; desc.Width = width;
desc.Height = height; desc.Height = height;
checkCudaErrors(cuArrayCreate(&cu_array, &desc)); checkCudaErrors(cuArrayCreate(&cu_array, &desc));
CUDA_MEMCPY2D copyParam; CUDA_MEMCPY2D copyParam;
memset(&copyParam, 0, sizeof(copyParam)); memset(&copyParam, 0, sizeof(copyParam));
copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
copyParam.dstArray = cu_array; copyParam.dstArray = cu_array;
copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
copyParam.srcHost = h_data; copyParam.srcHost = h_data;
copyParam.srcPitch = width * sizeof(float); copyParam.srcPitch = width * sizeof(float);
copyParam.WidthInBytes = copyParam.srcPitch; copyParam.WidthInBytes = copyParam.srcPitch;
copyParam.Height = height; copyParam.Height = height;
checkCudaErrors(cuMemcpy2D(&copyParam)); checkCudaErrors(cuMemcpy2D(&copyParam));
// set texture parameters // set texture parameters
CUtexObject TexObject; CUtexObject TexObject;
CUDA_RESOURCE_DESC ResDesc; CUDA_RESOURCE_DESC ResDesc;
memset(&ResDesc, 0, sizeof(CUDA_RESOURCE_DESC)); memset(&ResDesc, 0, sizeof(CUDA_RESOURCE_DESC));
ResDesc.resType = CU_RESOURCE_TYPE_ARRAY; ResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
ResDesc.res.array.hArray = cu_array; ResDesc.res.array.hArray = cu_array;
CUDA_TEXTURE_DESC TexDesc; CUDA_TEXTURE_DESC TexDesc;
memset(&TexDesc, 0, sizeof(CUDA_TEXTURE_DESC)); memset(&TexDesc, 0, sizeof(CUDA_TEXTURE_DESC));
TexDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP; TexDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
TexDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP; TexDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
TexDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP; TexDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
TexDesc.filterMode = CU_TR_FILTER_MODE_LINEAR; TexDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
TexDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; TexDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
checkCudaErrors(cuTexObjectCreate(&TexObject, &ResDesc, &TexDesc, NULL)); checkCudaErrors(cuTexObjectCreate(&TexObject, &ResDesc, &TexDesc, NULL));
// There are two ways to launch CUDA kernels via the Driver API. // There are two ways to launch CUDA kernels via the Driver API.
// In this CUDA Sample, we illustrate both ways to pass parameters // In this CUDA Sample, we illustrate both ways to pass parameters
// and specify parameters. By default we use the simpler method. // and specify parameters. By default we use the simpler method.
    int block_size = 8;
    StopWatchInterface *timer = NULL;

    if (1) {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (simpler method)
        void *args[5] = {&d_data, &width, &height, &angle, &TexObject};

        checkCudaErrors(cuLaunchKernel(
            transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
        checkCudaErrors(cuCtxSynchronize());
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);

        // launch kernel again for performance measurement
        checkCudaErrors(cuLaunchKernel(
            transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
    }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (advanced method)
        int  offset = 0;
        char argBuffer[256];

        // pass in launch parameters (not actually de-referencing CUdeviceptr).
        // CUdeviceptr is storing the value of the parameters
        *((CUdeviceptr *)&argBuffer[offset]) = d_data;
        offset += sizeof(d_data);
        *((unsigned int *)&argBuffer[offset]) = width;
        offset += sizeof(width);
        *((unsigned int *)&argBuffer[offset]) = height;
        offset += sizeof(height);
        *((float *)&argBuffer[offset]) = angle;
        offset += sizeof(angle);
        *((CUtexObject *)&argBuffer[offset]) = TexObject;
        offset += sizeof(TexObject);

        void *kernel_launch_config[5] = {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};

        // new CUDA 4.0 Driver API Kernel launch call (warmup)
        checkCudaErrors(cuLaunchKernel(transform,
                                       (width / block_size),
                                       (height / block_size),
                                       1,
                                       block_size,
                                       block_size,
                                       1,
                                       0,
                                       NULL,
                                       NULL,
                                       (void **)&kernel_launch_config));
        checkCudaErrors(cuCtxSynchronize());
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);

        // launch kernel again for performance measurement
        checkCudaErrors(cuLaunchKernel(transform,
                                       (width / block_size),
                                       (height / block_size),
                                       1,
                                       block_size,
                                       block_size,
                                       1,
                                       0,
                                       0,
                                       NULL,
                                       (void **)&kernel_launch_config));
    }

    checkCudaErrors(cuCtxSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cuMemcpyDtoH(h_odata, d_data, size));

    // write result to file
    char output_filename[1024];
    strcpy(output_filename, image_path);
    strcpy(output_filename + strlen(image_path) - 4, "_out.pgm");
    sdkSavePGM(output_filename, h_odata, width, height);
    printf("Wrote '%s'\n", output_filename);

    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, false);
    }
    else {
        // We need to reload the data from disk, because it is inverted upon output
        sdkLoadPGM(output_filename, &h_odata, &width, &height);

        printf("Comparing files\n");
        printf("\toutput: <%s>\n", output_filename);
        printf("\treference: <%s>\n", ref_path);

        bTestResults = compareData(h_odata, h_data_ref, width * height, MIN_EPSILON_ERROR, 0.15f);
    }

    // cleanup memory
    checkCudaErrors(cuTexObjectDestroy(TexObject));
    checkCudaErrors(cuMemFree(d_data));
    checkCudaErrors(cuArrayDestroy(cu_array));

    free(image_path);
    free(ref_path);

    checkCudaErrors(cuCtxDestroy(cuContext));

    exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -293,45 +307,44 @@ void runTest(int argc, char **argv) {
//! kernel function. After the module is loaded, cuModuleGetFunction //! kernel function. After the module is loaded, cuModuleGetFunction
//! retrieves the CUDA function pointer "cuFunction" //! retrieves the CUDA function pointer "cuFunction"
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static CUresult initCUDA(int argc, char **argv, CUfunction *transform)
{
    CUfunction cuFunction = 0;
    int        major = 0, minor = 0, devID = 0;
    char       deviceName[100];
    string     module_path;

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

    // get compute capabilities and the devicename
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module_path before we try to load the results
    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
    checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, "transformKernel"));

    *transform = cuFunction;

    return CUDA_SUCCESS;
}
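The runTest() above notes that the Driver API accepts kernel arguments in two ways; a condensed sketch of both, assuming a loaded CUfunction f that takes a single int argument, might look like this:

    // Style 1: kernelParams - an array of pointers, one per kernel argument.
    int   n = 42;
    void *params[] = {&n};
    checkCudaErrors(cuLaunchKernel(f, 1, 1, 1, 32, 1, 1, 0, NULL, params, NULL));

    // Style 2: extra - arguments packed into a single raw buffer.
    char   buf[sizeof(int)];
    size_t bufSize = sizeof(buf);
    memcpy(buf, &n, sizeof(int));
    void *extra[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, buf, CU_LAUNCH_PARAM_BUFFER_SIZE, &bufSize, CU_LAUNCH_PARAM_END};
    checkCudaErrors(cuLaunchKernel(f, 1, 1, 1, 32, 1, 1, 0, NULL, NULL, extra));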

View File

@ -33,23 +33,22 @@
//! Transform an image using texture lookups //! Transform an image using texture lookups
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta, CUtexObject tex)
{
    // calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
float u = (float)x - (float)width / 2; float u = (float)x - (float)width / 2;
float v = (float)y - (float)height / 2; float v = (float)y - (float)height / 2;
float tu = u * cosf(theta) - v * sinf(theta); float tu = u * cosf(theta) - v * sinf(theta);
float tv = v * cosf(theta) + u * sinf(theta); float tv = v * cosf(theta) + u * sinf(theta);
tu /= (float)width; tu /= (float)width;
tv /= (float)height; tv /= (float)height;
// read from texture and write to global memory // read from texture and write to global memory
g_odata[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f); g_odata[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f);
} }
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_ #endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
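The kernel rotates texture coordinates about the image center, so a host-side launch would pair it with an 8x8 block (matching block_size in the driver-API host code above). A runtime-API sketch, with d_out, texObj, width and height assumed to exist:

    // Sketch only: the sample itself launches this kernel through the Driver API fatbin.
    dim3 block(8, 8, 1);
    dim3 grid(width / block.x, height / block.y, 1); // assumes width/height are multiples of 8
    transformKernel<<<grid, block>>>(d_out, width, height, 0.5f, texObj);
    checkCudaErrors(cudaDeviceSynchronize());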

View File

@ -53,257 +53,237 @@ static const char *sSDKsample = "[simpleVoteIntrinsics]\0";
#include "simpleVote_kernel.cuh" #include "simpleVote_kernel.cuh"
// Generate the test pattern for Tests 1 and 2 // Generate the test pattern for Tests 1 and 2
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size) { void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
// For testing VOTE.Any (all of these threads will return 0) {
for (int i = 0; i < size / 4; i++) { // For testing VOTE.Any (all of these threads will return 0)
VOTE_PATTERN[i] = 0x00000000; for (int i = 0; i < size / 4; i++) {
} VOTE_PATTERN[i] = 0x00000000;
// For testing VOTE.Any (1/2 these threads will return 1)
for (int i = 2 * size / 8; i < 4 * size / 8; i++) {
VOTE_PATTERN[i] = (i & 0x01) ? i : 0;
}
// For testing VOTE.all (1/2 of these threads will return 0)
for (int i = 2 * size / 4; i < 3 * size / 4; i++) {
VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;
}
// For testing VOTE.all (all of these threads will return 1)
for (int i = 3 * size / 4; i < 4 * size / 4; i++) {
VOTE_PATTERN[i] = 0xffffffff;
}
}
int checkErrors1(unsigned int *h_result, int start, int end, int warp_size,
const char *voteType) {
int i, sum = 0;
for (sum = 0, i = start; i < end; i++) {
sum += h_result[i];
}
if (sum > 0) {
printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);
for (i = start; i < end; i++) {
printf("%d", h_result[i]);
} }
printf("%d values FAILED\n", sum); // For testing VOTE.Any (1/2 these threads will return 1)
} for (int i = 2 * size / 8; i < 4 * size / 8; i++) {
VOTE_PATTERN[i] = (i & 0x01) ? i : 0;
return (sum > 0);
}
int checkErrors2(unsigned int *h_result, int start, int end, int warp_size,
const char *voteType) {
int i, sum = 0;
for (sum = 0, i = start; i < end; i++) {
sum += h_result[i];
}
if (sum != warp_size) {
printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);
for (i = start; i < end; i++) {
printf("%d", h_result[i]);
} }
printf(" - FAILED\n"); // For testing VOTE.all (1/2 of these threads will return 0)
} for (int i = 2 * size / 4; i < 3 * size / 4; i++) {
VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;
}
return (sum != warp_size); // For testing VOTE.all (all of these threads will return 1)
for (int i = 3 * size / 4; i < 4 * size / 4; i++) {
VOTE_PATTERN[i] = 0xffffffff;
}
}
int checkErrors1(unsigned int *h_result, int start, int end, int warp_size, const char *voteType)
{
int i, sum = 0;
for (sum = 0, i = start; i < end; i++) {
sum += h_result[i];
}
if (sum > 0) {
printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);
for (i = start; i < end; i++) {
printf("%d", h_result[i]);
}
printf("%d values FAILED\n", sum);
}
return (sum > 0);
}
int checkErrors2(unsigned int *h_result, int start, int end, int warp_size, const char *voteType)
{
int i, sum = 0;
for (sum = 0, i = start; i < end; i++) {
sum += h_result[i];
}
if (sum != warp_size) {
printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);
for (i = start; i < end; i++) {
printf("%d", h_result[i]);
}
printf(" - FAILED\n");
}
return (sum != warp_size);
} }
// Verification code for Kernel #1 // Verification code for Kernel #1
int checkResultsVoteAnyKernel1(unsigned int *h_result, int size, int checkResultsVoteAnyKernel1(unsigned int *h_result, int size, int warp_size)
int warp_size) { {
int error_count = 0; int error_count = 0;
error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
warp_size, "Vote.Any"); error_count += checkErrors2(
error_count += h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
checkErrors2(h_result, VOTE_DATA_GROUP * warp_size / 4, error_count += checkErrors2(
2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any"); h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
error_count += error_count += checkErrors2(
checkErrors2(h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
error_count +=
checkErrors2(h_result, 3 * VOTE_DATA_GROUP * warp_size / 4,
4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
printf((error_count == 0) ? "\tOK\n" : "\tERROR\n"); printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
return error_count; return error_count;
} }
// Verification code for Kernel #2 // Verification code for Kernel #2
int checkResultsVoteAllKernel2(unsigned int *h_result, int size, int checkResultsVoteAllKernel2(unsigned int *h_result, int size, int warp_size)
int warp_size) { {
int error_count = 0; int error_count = 0;
error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
warp_size, "Vote.All"); error_count += checkErrors1(
error_count += h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
checkErrors1(h_result, VOTE_DATA_GROUP * warp_size / 4, error_count += checkErrors1(
2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All"); h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
error_count += error_count += checkErrors2(
checkErrors1(h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
error_count +=
checkErrors2(h_result, 3 * VOTE_DATA_GROUP * warp_size / 4,
4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
printf((error_count == 0) ? "\tOK\n" : "\tERROR\n"); printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
return error_count; return error_count;
} }
// Verification code for Kernel #3 // Verification code for Kernel #3
int checkResultsVoteAnyKernel3(bool *hinfo, int size) { int checkResultsVoteAnyKernel3(bool *hinfo, int size)
int i, error_count = 0; {
int i, error_count = 0;
for (i = 0; i < size * 3; i++) { for (i = 0; i < size * 3; i++) {
switch (i % 3) { switch (i % 3) {
case 0: case 0:
// First warp should be all zeros. // First warp should be all zeros.
if (hinfo[i] != (i >= size * 1)) { if (hinfo[i] != (i >= size * 1)) {
error_count++; error_count++;
}
break;
case 1:
// First warp and half of second should be all zeros.
if (hinfo[i] != (i >= size * 3 / 2)) {
error_count++;
}
break;
case 2:
// First two warps should be all zeros.
if (hinfo[i] != (i >= size * 2)) {
error_count++;
}
break;
} }
break;
case 1:
// First warp and half of second should be all zeros.
if (hinfo[i] != (i >= size * 3 / 2)) {
error_count++;
}
break;
case 2:
// First two warps should be all zeros.
if (hinfo[i] != (i >= size * 2)) {
error_count++;
}
break;
} }
}
printf((error_count == 0) ? "\tOK\n" : "\tERROR\n"); printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
return error_count; return error_count;
} }
int main(int argc, char **argv) { int main(int argc, char **argv)
unsigned int *h_input, *h_result; {
unsigned int *d_input, *d_result; unsigned int *h_input, *h_result;
unsigned int *d_input, *d_result;
bool *dinfo = NULL, *hinfo = NULL; bool *dinfo = NULL, *hinfo = NULL;
int error_count[3] = {0, 0, 0}; int error_count[3] = {0, 0, 0};
cudaDeviceProp deviceProp; cudaDeviceProp deviceProp;
int devID, warp_size = 32; int devID, warp_size = 32;
printf("%s\n", sSDKsample); printf("%s\n", sSDKsample);
// This will pick the best possible CUDA capable device // This will pick the best possible CUDA capable device
devID = findCudaDevice(argc, (const char **)argv); devID = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
// Statistics about the GPU device // Statistics about the GPU device
printf( printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
"> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", deviceProp.multiProcessorCount,
deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); deviceProp.major,
deviceProp.minor);
h_input = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * h_input = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int));
sizeof(unsigned int)); h_result = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int));
h_result = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * checkCudaErrors(
sizeof(unsigned int)); cudaMalloc(reinterpret_cast<void **>(&d_input), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
checkCudaErrors( checkCudaErrors(
cudaMalloc(reinterpret_cast<void **>(&d_input), cudaMalloc(reinterpret_cast<void **>(&d_result), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int))); genVoteTestPattern(h_input, VOTE_DATA_GROUP * warp_size);
checkCudaErrors( checkCudaErrors(
cudaMalloc(reinterpret_cast<void **>(&d_result), cudaMemcpy(d_input, h_input, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyHostToDevice));
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
genVoteTestPattern(h_input, VOTE_DATA_GROUP * warp_size);
checkCudaErrors(cudaMemcpy(d_input, h_input,
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int),
cudaMemcpyHostToDevice));
// Start of Vote Any Test Kernel #1 // Start of Vote Any Test Kernel #1
printf("[VOTE Kernel Test 1/3]\n"); printf("[VOTE Kernel Test 1/3]\n");
printf("\tRunning <<Vote.Any>> kernel1 ...\n"); printf("\tRunning <<Vote.Any>> kernel1 ...\n");
{ {
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
dim3 gridBlock(1, 1); dim3 gridBlock(1, 1);
dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1); dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
VoteAnyKernel1<<<gridBlock, threadBlock>>>(d_input, d_result, VoteAnyKernel1<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
VOTE_DATA_GROUP * warp_size); getLastCudaError("VoteAnyKernel() execution failed\n");
getLastCudaError("VoteAnyKernel() execution failed\n"); checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaDeviceSynchronize()); }
} checkCudaErrors(
checkCudaErrors(cudaMemcpy(h_result, d_result, cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), error_count[0] += checkResultsVoteAnyKernel1(h_result, VOTE_DATA_GROUP * warp_size, warp_size);
cudaMemcpyDeviceToHost));
error_count[0] += checkResultsVoteAnyKernel1(
h_result, VOTE_DATA_GROUP * warp_size, warp_size);
// Start of Vote All Test Kernel #2 // Start of Vote All Test Kernel #2
printf("\n[VOTE Kernel Test 2/3]\n"); printf("\n[VOTE Kernel Test 2/3]\n");
printf("\tRunning <<Vote.All>> kernel2 ...\n"); printf("\tRunning <<Vote.All>> kernel2 ...\n");
{ {
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
dim3 gridBlock(1, 1); dim3 gridBlock(1, 1);
dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1); dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
VoteAllKernel2<<<gridBlock, threadBlock>>>(d_input, d_result, VoteAllKernel2<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
VOTE_DATA_GROUP * warp_size); getLastCudaError("VoteAllKernel() execution failed\n");
getLastCudaError("VoteAllKernel() execution failed\n"); checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaDeviceSynchronize()); }
} checkCudaErrors(
checkCudaErrors(cudaMemcpy(h_result, d_result, cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), error_count[1] += checkResultsVoteAllKernel2(h_result, VOTE_DATA_GROUP * warp_size, warp_size);
cudaMemcpyDeviceToHost));
error_count[1] += checkResultsVoteAllKernel2(
h_result, VOTE_DATA_GROUP * warp_size, warp_size);
// Second Vote Kernel Test #3 (both Any/All) // Second Vote Kernel Test #3 (both Any/All)
hinfo = reinterpret_cast<bool *>(calloc(warp_size * 3 * 3, sizeof(bool))); hinfo = reinterpret_cast<bool *>(calloc(warp_size * 3 * 3, sizeof(bool)));
cudaMalloc(reinterpret_cast<void **>(&dinfo), cudaMalloc(reinterpret_cast<void **>(&dinfo), warp_size * 3 * 3 * sizeof(bool));
warp_size * 3 * 3 * sizeof(bool)); cudaMemcpy(dinfo, hinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyHostToDevice);
cudaMemcpy(dinfo, hinfo, warp_size * 3 * 3 * sizeof(bool),
cudaMemcpyHostToDevice);
printf("\n[VOTE Kernel Test 3/3]\n"); printf("\n[VOTE Kernel Test 3/3]\n");
printf("\tRunning <<Vote.Any>> kernel3 ...\n"); printf("\tRunning <<Vote.Any>> kernel3 ...\n");
{ {
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
VoteAnyKernel3<<<1, warp_size * 3>>>(dinfo, warp_size); VoteAnyKernel3<<<1, warp_size * 3>>>(dinfo, warp_size);
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
} }
cudaMemcpy(hinfo, dinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpy(hinfo, dinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyDeviceToHost);
cudaMemcpyDeviceToHost);
error_count[2] = checkResultsVoteAnyKernel3(hinfo, warp_size * 3); error_count[2] = checkResultsVoteAnyKernel3(hinfo, warp_size * 3);
// Now free these resources for Test #1,2 // Now free these resources for Test #1,2
checkCudaErrors(cudaFree(d_input)); checkCudaErrors(cudaFree(d_input));
checkCudaErrors(cudaFree(d_result)); checkCudaErrors(cudaFree(d_result));
free(h_input); free(h_input);
free(h_result); free(h_result);
// Free resources from Test #3 // Free resources from Test #3
free(hinfo); free(hinfo);
cudaFree(dinfo); cudaFree(dinfo);
printf("\tShutting down...\n"); printf("\tShutting down...\n");
return (error_count[0] == 0 && error_count[1] == 0 && error_count[2] == 0) return (error_count[0] == 0 && error_count[1] == 0 && error_count[2] == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
? EXIT_SUCCESS
: EXIT_FAILURE;
} }

View File

@ -38,43 +38,44 @@
// If ANY one of the threads (within the warp) of the predicated condition // If ANY one of the threads (within the warp) of the predicated condition
// returns a non-zero value, then all threads within this warp will return a // returns a non-zero value, then all threads within this warp will return a
// non-zero value // non-zero value
__global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result, __global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result, int size)
int size) { {
int tx = threadIdx.x; int tx = threadIdx.x;
int mask = 0xffffffff; int mask = 0xffffffff;
result[tx] = __any_sync(mask, input[tx]); result[tx] = __any_sync(mask, input[tx]);
} }
// Kernel #2 tests the across-the-warp vote(all) intrinsic. // Kernel #2 tests the across-the-warp vote(all) intrinsic.
// If ALL of the threads (within the warp) of the predicated condition returns // If ALL of the threads (within the warp) of the predicated condition returns
// a non-zero value, then all threads within this warp will return a non-zero // a non-zero value, then all threads within this warp will return a non-zero
// value // value
__global__ void VoteAllKernel2(unsigned int *input, unsigned int *result, __global__ void VoteAllKernel2(unsigned int *input, unsigned int *result, int size)
int size) { {
int tx = threadIdx.x; int tx = threadIdx.x;
int mask = 0xffffffff; int mask = 0xffffffff;
result[tx] = __all_sync(mask, input[tx]); result[tx] = __all_sync(mask, input[tx]);
} }
// Kernel #3 is a directed test for the across-the-warp vote(all) intrinsic. // Kernel #3 is a directed test for the across-the-warp vote(all) intrinsic.
// This kernel will test for conditions across warps, and within half warps // This kernel will test for conditions across warps, and within half warps
__global__ void VoteAnyKernel3(bool *info, int warp_size) { __global__ void VoteAnyKernel3(bool *info, int warp_size)
int tx = threadIdx.x; {
unsigned int mask = 0xffffffff; int tx = threadIdx.x;
bool *offs = info + (tx * 3); unsigned int mask = 0xffffffff;
bool *offs = info + (tx * 3);
// The following should hold true for the second and third warp // The following should hold true for the second and third warp
*offs = __any_sync(mask, (tx >= (warp_size * 3) / 2)); *offs = __any_sync(mask, (tx >= (warp_size * 3) / 2));
// The following should hold true for the "upper half" of the second warp, // The following should hold true for the "upper half" of the second warp,
// and all of the third warp // and all of the third warp
*(offs + 1) = (tx >= (warp_size * 3) / 2 ? true : false); *(offs + 1) = (tx >= (warp_size * 3) / 2 ? true : false);
// The following should hold true for the third warp only // The following should hold true for the third warp only
if (__all_sync(mask, (tx >= (warp_size * 3) / 2))) { if (__all_sync(mask, (tx >= (warp_size * 3) / 2))) {
*(offs + 2) = true; *(offs + 2) = true;
} }
} }
#endif #endif
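Alongside the __any_sync and __all_sync votes used above, a closely related warp-vote idiom (not part of this sample) counts the lanes that satisfy a predicate with __ballot_sync and __popc. A minimal sketch, assuming a full 32-lane warp is active and blockDim.x is a multiple of 32:

__global__ void countPredicate(const unsigned int *input, int *counts)
{
    unsigned int mask   = 0xffffffff;
    int          lane   = threadIdx.x & 31;
    unsigned int ballot = __ballot_sync(mask, input[threadIdx.x] != 0);

    if (lane == 0) {
        counts[threadIdx.x / 32] = __popc(ballot); // number of lanes holding a non-zero element
    }
}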

View File

@ -41,12 +41,13 @@
#endif #endif
/* Add two vectors on the GPU */ /* Add two vectors on the GPU */
__global__ void vectorAddGPU(float *a, float *b, float *c, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N) { if (idx < N) {
c[idx] = a[idx] + b[idx]; c[idx] = a[idx] + b[idx];
} }
} }
// Allocate generic memory with malloc() and pin it later instead of using
@ -54,194 +55,196 @@ __global__ void vectorAddGPU(float *a, float *b, float *c, int N) {
bool bPinGenericMemory = false; bool bPinGenericMemory = false;
// Macro to align up to the memory size in question
#define MEMORY_ALIGNMENT 4096 #define MEMORY_ALIGNMENT 4096
#define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1))) #define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))
int main(int argc, char **argv)
{
    int            n, nelem, deviceCount;
    int            idev = 0; // use default device 0
    char          *device = NULL;
    unsigned int   flags;
    size_t         bytes;
    float         *a, *b, *c;          // Pinned memory allocated on the CPU
    float         *a_UA, *b_UA, *c_UA; // Non-4K Aligned Pinned memory on the CPU
    float         *d_a, *d_b, *d_c;    // Device pointers for mapped memory
    float          errorNorm, refNorm, ref, diff;
    cudaDeviceProp deviceProp;
if (checkCmdLineFlag(argc, (const char **)argv, "help")) { if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
printf("Usage: simpleZeroCopy [OPTION]\n\n"); printf("Usage: simpleZeroCopy [OPTION]\n\n");
printf("Options:\n"); printf("Options:\n");
printf(" --device=[device #] Specify the device to be used\n"); printf(" --device=[device #] Specify the device to be used\n");
printf( printf(" --use_generic_memory (optional) use generic page-aligned for system "
" --use_generic_memory (optional) use generic page-aligned for system " "memory\n");
"memory\n"); return EXIT_SUCCESS;
return EXIT_SUCCESS;
}
/* Get the device selected by the user or default to 0, and then set it. */
if (getCmdLineArgumentString(argc, (const char **)argv, "device", &device)) {
cudaGetDeviceCount(&deviceCount);
idev = atoi(device);
if (idev >= deviceCount || idev < 0) {
fprintf(stderr,
"Device number %d is invalid, will use default CUDA device 0.\n",
idev);
idev = 0;
} }
}
// if GPU found supports SM 1.2, then continue, otherwise we exit /* Get the device selected by the user or default to 0, and then set it. */
if (!checkCudaCapabilities(1, 2)) { if (getCmdLineArgumentString(argc, (const char **)argv, "device", &device)) {
exit(EXIT_SUCCESS); cudaGetDeviceCount(&deviceCount);
} idev = atoi(device);
if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) { if (idev >= deviceCount || idev < 0) {
fprintf(stderr, "Device number %d is invalid, will use default CUDA device 0.\n", idev);
idev = 0;
}
}
// if GPU found supports SM 1.2, then continue, otherwise we exit
if (!checkCudaCapabilities(1, 2)) {
exit(EXIT_SUCCESS);
}
if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) {
#if defined(__APPLE__) || defined(MACOSX) #if defined(__APPLE__) || defined(MACOSX)
bPinGenericMemory = false; // Generic Pinning of System Paged memory is not bPinGenericMemory = false; // Generic Pinning of System Paged memory is not
// currently supported on Mac OSX // currently supported on Mac OSX
#else #else
bPinGenericMemory = true; bPinGenericMemory = true;
#endif #endif
} }
if (bPinGenericMemory) { if (bPinGenericMemory) {
printf("> Using Generic System Paged Memory (malloc)\n"); printf("> Using Generic System Paged Memory (malloc)\n");
} else { }
printf("> Using CUDA Host Allocated (cudaHostAlloc)\n"); else {
} printf("> Using CUDA Host Allocated (cudaHostAlloc)\n");
}
checkCudaErrors(cudaSetDevice(idev)); checkCudaErrors(cudaSetDevice(idev));
/* Verify the selected device supports mapped memory and set the device /* Verify the selected device supports mapped memory and set the device
flags for mapping host memory. */ flags for mapping host memory. */
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, idev)); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, idev));
#if CUDART_VERSION >= 2020

    if (!deviceProp.canMapHostMemory) {
        fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", idev);
        exit(EXIT_SUCCESS);
    }

    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
#else
    fprintf(stderr,
            "CUDART version %d.%d does not support "
            "<cudaDeviceProp.canMapHostMemory> field\n",
            CUDART_VERSION / 1000,
            (CUDART_VERSION % 100) / 10);
    exit(EXIT_SUCCESS);
#endif
#if CUDART_VERSION < 4000 #if CUDART_VERSION < 4000
if (bPinGenericMemory) { if (bPinGenericMemory) {
fprintf( fprintf(stderr,
stderr, "CUDART version %d.%d does not support <cudaHostRegister> function\n",
"CUDART version %d.%d does not support <cudaHostRegister> function\n", CUDART_VERSION / 1000,
CUDART_VERSION / 1000, (CUDART_VERSION % 100) / 10); (CUDART_VERSION % 100) / 10);
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
#endif #endif
/* Allocate mapped CPU memory. */ /* Allocate mapped CPU memory. */
nelem = 1048576; nelem = 1048576;
bytes = nelem * sizeof(float); bytes = nelem * sizeof(float);
if (bPinGenericMemory) { if (bPinGenericMemory) {
#if CUDART_VERSION >= 4000 #if CUDART_VERSION >= 4000
a_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); a_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
b_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); b_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
c_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); c_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
// We need to ensure memory is aligned to 4K (so we will need to pad memory
// accordingly)
a = (float *)ALIGN_UP(a_UA, MEMORY_ALIGNMENT); a = (float *)ALIGN_UP(a_UA, MEMORY_ALIGNMENT);
b = (float *)ALIGN_UP(b_UA, MEMORY_ALIGNMENT); b = (float *)ALIGN_UP(b_UA, MEMORY_ALIGNMENT);
c = (float *)ALIGN_UP(c_UA, MEMORY_ALIGNMENT); c = (float *)ALIGN_UP(c_UA, MEMORY_ALIGNMENT);
checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped)); checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped));
checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped)); checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped));
checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped)); checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped));
#endif #endif
} else { }
else {
#if CUDART_VERSION >= 2020 #if CUDART_VERSION >= 2020
flags = cudaHostAllocMapped; flags = cudaHostAllocMapped;
checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags)); checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags));
checkCudaErrors(cudaHostAlloc((void **)&b, bytes, flags)); checkCudaErrors(cudaHostAlloc((void **)&b, bytes, flags));
checkCudaErrors(cudaHostAlloc((void **)&c, bytes, flags)); checkCudaErrors(cudaHostAlloc((void **)&c, bytes, flags));
#endif #endif
} }
/* Initialize the vectors. */ /* Initialize the vectors. */
for (n = 0; n < nelem; n++) { for (n = 0; n < nelem; n++) {
a[n] = rand() / (float)RAND_MAX; a[n] = rand() / (float)RAND_MAX;
b[n] = rand() / (float)RAND_MAX; b[n] = rand() / (float)RAND_MAX;
} }
/* Get the device pointers for the pinned CPU memory mapped into the GPU /* Get the device pointers for the pinned CPU memory mapped into the GPU
memory space. */ memory space. */
#if CUDART_VERSION >= 2020 #if CUDART_VERSION >= 2020
checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0)); checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0));
checkCudaErrors(cudaHostGetDevicePointer((void **)&d_b, (void *)b, 0)); checkCudaErrors(cudaHostGetDevicePointer((void **)&d_b, (void *)b, 0));
checkCudaErrors(cudaHostGetDevicePointer((void **)&d_c, (void *)c, 0)); checkCudaErrors(cudaHostGetDevicePointer((void **)&d_c, (void *)c, 0));
#endif #endif
/* Call the GPU kernel using the CPU pointers residing in CPU mapped memory. /* Call the GPU kernel using the CPU pointers residing in CPU mapped memory.
*/ */
printf("> vectorAddGPU kernel will add vectors using mapped CPU memory...\n"); printf("> vectorAddGPU kernel will add vectors using mapped CPU memory...\n");
dim3 block(256); dim3 block(256);
dim3 grid((unsigned int)ceil(nelem / (float)block.x)); dim3 grid((unsigned int)ceil(nelem / (float)block.x));
vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem); vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem);
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
getLastCudaError("vectorAddGPU() execution failed"); getLastCudaError("vectorAddGPU() execution failed");
/* Compare the results */ /* Compare the results */
printf("> Checking the results from vectorAddGPU() ...\n"); printf("> Checking the results from vectorAddGPU() ...\n");
errorNorm = 0.f; errorNorm = 0.f;
refNorm = 0.f; refNorm = 0.f;
for (n = 0; n < nelem; n++) { for (n = 0; n < nelem; n++) {
ref = a[n] + b[n]; ref = a[n] + b[n];
diff = c[n] - ref; diff = c[n] - ref;
errorNorm += diff * diff; errorNorm += diff * diff;
refNorm += ref * ref; refNorm += ref * ref;
} }
errorNorm = (float)sqrt((double)errorNorm); errorNorm = (float)sqrt((double)errorNorm);
refNorm = (float)sqrt((double)refNorm); refNorm = (float)sqrt((double)refNorm);
/* Memory clean up */ /* Memory clean up */
printf("> Releasing CPU memory...\n"); printf("> Releasing CPU memory...\n");
if (bPinGenericMemory) { if (bPinGenericMemory) {
#if CUDART_VERSION >= 4000 #if CUDART_VERSION >= 4000
checkCudaErrors(cudaHostUnregister(a)); checkCudaErrors(cudaHostUnregister(a));
checkCudaErrors(cudaHostUnregister(b)); checkCudaErrors(cudaHostUnregister(b));
checkCudaErrors(cudaHostUnregister(c)); checkCudaErrors(cudaHostUnregister(c));
free(a_UA); free(a_UA);
free(b_UA); free(b_UA);
free(c_UA); free(c_UA);
#endif #endif
} else { }
else {
#if CUDART_VERSION >= 2020 #if CUDART_VERSION >= 2020
checkCudaErrors(cudaFreeHost(a)); checkCudaErrors(cudaFreeHost(a));
checkCudaErrors(cudaFreeHost(b)); checkCudaErrors(cudaFreeHost(b));
checkCudaErrors(cudaFreeHost(c)); checkCudaErrors(cudaFreeHost(c));
#endif #endif
} }
exit(errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE); exit(errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE);
} }
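Condensed, the generic-memory path exercised above (malloc, align to 4 KiB, pin with cudaHostRegister, map into the device address space) looks roughly like the following sketch; nelem is assumed to be defined as in the sample:

    size_t bytes = nelem * sizeof(float);
    float *raw   = (float *)malloc(bytes + MEMORY_ALIGNMENT);
    float *a     = (float *)ALIGN_UP(raw, MEMORY_ALIGNMENT); // round up to a 4096-byte boundary

    checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped));

    float *d_a = NULL;
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0)); // same memory, device-visible

    // ... launch kernels that read and write d_a ...

    checkCudaErrors(cudaHostUnregister(a));
    free(raw);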

View File

@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details) ## References (for more details)

View File

@ -29,113 +29,111 @@
* memory. * memory.
*/ */
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <math.h>
#include <stdint.h>

#include <cstdio>
#include <ctime>
#define min(a, b) (a) < (b) ? (a) : (b) #define min(a, b) (a) < (b) ? (a) : (b)
#define max(a, b) (a) > (b) ? (a) : (b) #define max(a, b) (a) > (b) ? (a) : (b)
#define LOOP_NUM 50 #define LOOP_NUM 50
__global__ void atomicKernel(int *atom_arr)
{
    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = 0; i < LOOP_NUM; i++) { for (int i = 0; i < LOOP_NUM; i++) {
// Atomic addition // Atomic addition
atomicAdd_system(&atom_arr[0], 10); atomicAdd_system(&atom_arr[0], 10);
// Atomic exchange // Atomic exchange
atomicExch_system(&atom_arr[1], tid); atomicExch_system(&atom_arr[1], tid);
// Atomic maximum // Atomic maximum
atomicMax_system(&atom_arr[2], tid); atomicMax_system(&atom_arr[2], tid);
// Atomic minimum // Atomic minimum
atomicMin_system(&atom_arr[3], tid); atomicMin_system(&atom_arr[3], tid);
// Atomic increment (modulo 17+1) // Atomic increment (modulo 17+1)
atomicInc_system((unsigned int *)&atom_arr[4], 17); atomicInc_system((unsigned int *)&atom_arr[4], 17);
// Atomic decrement // Atomic decrement
atomicDec_system((unsigned int *)&atom_arr[5], 137); atomicDec_system((unsigned int *)&atom_arr[5], 137);
// Atomic compare-and-swap // Atomic compare-and-swap
atomicCAS_system(&atom_arr[6], tid - 1, tid); atomicCAS_system(&atom_arr[6], tid - 1, tid);
// Bitwise atomic instructions // Bitwise atomic instructions
// Atomic AND // Atomic AND
atomicAnd_system(&atom_arr[7], 2 * tid + 7); atomicAnd_system(&atom_arr[7], 2 * tid + 7);
// Atomic OR // Atomic OR
atomicOr_system(&atom_arr[8], 1 << tid); atomicOr_system(&atom_arr[8], 1 << tid);
// Atomic XOR // Atomic XOR
atomicXor_system(&atom_arr[9], tid); atomicXor_system(&atom_arr[9], tid);
} }
} }
void atomicKernel_CPU(int *atom_arr, int no_of_threads)
{
    for (int i = no_of_threads; i < 2 * no_of_threads; i++) {
        for (int j = 0; j < LOOP_NUM; j++) {
            // Atomic addition
            __sync_fetch_and_add(&atom_arr[0], 10);
// Atomic exchange // Atomic exchange
__sync_lock_test_and_set(&atom_arr[1], i); __sync_lock_test_and_set(&atom_arr[1], i);
// Atomic maximum // Atomic maximum
int old, expected; int old, expected;
do { do {
expected = atom_arr[2]; expected = atom_arr[2];
old = __sync_val_compare_and_swap(&atom_arr[2], expected, old = __sync_val_compare_and_swap(&atom_arr[2], expected, max(expected, i));
max(expected, i)); } while (old != expected);
} while (old != expected);
// Atomic minimum // Atomic minimum
do { do {
expected = atom_arr[3]; expected = atom_arr[3];
old = __sync_val_compare_and_swap(&atom_arr[3], expected, old = __sync_val_compare_and_swap(&atom_arr[3], expected, min(expected, i));
min(expected, i)); } while (old != expected);
} while (old != expected);
// Atomic increment (modulo 17+1) // Atomic increment (modulo 17+1)
int limit = 17; int limit = 17;
do { do {
expected = atom_arr[4]; expected = atom_arr[4];
old = __sync_val_compare_and_swap( old = __sync_val_compare_and_swap(&atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1);
&atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1); } while (old != expected);
} while (old != expected);
// Atomic decrement // Atomic decrement
limit = 137; limit = 137;
do { do {
expected = atom_arr[5]; expected = atom_arr[5];
old = __sync_val_compare_and_swap( old = __sync_val_compare_and_swap(
&atom_arr[5], expected, &atom_arr[5], expected, ((expected == 0) || (expected > limit)) ? limit : expected - 1);
((expected == 0) || (expected > limit)) ? limit : expected - 1); } while (old != expected);
} while (old != expected);
// Atomic compare-and-swap // Atomic compare-and-swap
__sync_val_compare_and_swap(&atom_arr[6], i - 1, i); __sync_val_compare_and_swap(&atom_arr[6], i - 1, i);
// Bitwise atomic instructions // Bitwise atomic instructions
// Atomic AND // Atomic AND
__sync_fetch_and_and(&atom_arr[7], 2 * i + 7); __sync_fetch_and_and(&atom_arr[7], 2 * i + 7);
// Atomic OR // Atomic OR
__sync_fetch_and_or(&atom_arr[8], 1 << i); __sync_fetch_and_or(&atom_arr[8], 1 << i);
// Atomic XOR // Atomic XOR
// 10th element should be 0xff // 10th element should be 0xff
__sync_fetch_and_xor(&atom_arr[9], i); __sync_fetch_and_xor(&atom_arr[9], i);
}
} }
}
} }
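For orientation (values taken from main() further down, where numThreads = 256 and numBlocks = 64): the GPU kernel and this CPU loop split one logical index space, which is why verify() is later called with twice the GPU thread count.
// GPU kernel tids  : [0, 64 * 256)      = [0, 16384)
// CPU loop indices : [16384, 2 * 16384) = [16384, 32768)
// verify() length  : 2 * numThreads * numBlocks = 32768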
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -145,198 +143,201 @@ void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
//! @param idata input data as provided to device //! @param idata input data as provided to device
//! @param len number of elements in reference / idata //! @param len number of elements in reference / idata
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int verify(int *testData, const int len) { int verify(int *testData, const int len)
int val = 0; {
int val = 0;
for (int i = 0; i < len * LOOP_NUM; ++i) { for (int i = 0; i < len * LOOP_NUM; ++i) {
val += 10; val += 10;
}
if (val != testData[0]) {
printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]);
return false;
}
val = 0;
bool found = false;
for (int i = 0; i < len; ++i) {
// second element should be a member of [0, len)
if (i == testData[1]) {
found = true;
break;
} }
}
if (!found) { if (val != testData[0]) {
printf("atomicExch failed\n"); printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]);
return false; return false;
}
val = -(1 << 8);
for (int i = 0; i < len; ++i) {
// third element should be len-1
val = max(val, i);
}
if (val != testData[2]) {
printf("atomicMax failed\n");
return false;
}
val = 1 << 8;
for (int i = 0; i < len; ++i) {
val = min(val, i);
}
if (val != testData[3]) {
printf("atomicMin failed\n");
return false;
}
int limit = 17;
val = 0;
for (int i = 0; i < len * LOOP_NUM; ++i) {
val = (val >= limit) ? 0 : val + 1;
}
if (val != testData[4]) {
printf("atomicInc failed\n");
return false;
}
limit = 137;
val = 0;
for (int i = 0; i < len * LOOP_NUM; ++i) {
val = ((val == 0) || (val > limit)) ? limit : val - 1;
}
if (val != testData[5]) {
printf("atomicDec failed\n");
return false;
}
found = false;
for (int i = 0; i < len; ++i) {
// seventh element should be a member of [0, len)
if (i == testData[6]) {
found = true;
break;
} }
}
if (!found) { val = 0;
printf("atomicCAS failed\n");
return false;
}
val = 0xff; bool found = false;
for (int i = 0; i < len; ++i) { for (int i = 0; i < len; ++i) {
// 8th element should be 1 // second element should be a member of [0, len)
val &= (2 * i + 7); if (i == testData[1]) {
} found = true;
break;
}
}
if (val != testData[7]) { if (!found) {
printf("atomicAnd failed\n"); printf("atomicExch failed\n");
return false; return false;
} }
val = 0; val = -(1 << 8);
for (int i = 0; i < len; ++i) { for (int i = 0; i < len; ++i) {
// 9th element should be 0xff // third element should be len-1
val |= (1 << i); val = max(val, i);
} }
if (val != testData[8]) { if (val != testData[2]) {
printf("atomicOr failed\n"); printf("atomicMax failed\n");
return false; return false;
} }
val = 0xff; val = 1 << 8;
for (int i = 0; i < len; ++i) { for (int i = 0; i < len; ++i) {
// 10th element should be 0xff val = min(val, i);
val ^= i; }
}
if (val != testData[9]) { if (val != testData[3]) {
printf("atomicXor failed\n"); printf("atomicMin failed\n");
return false; return false;
} }
return true; int limit = 17;
val = 0;
for (int i = 0; i < len * LOOP_NUM; ++i) {
val = (val >= limit) ? 0 : val + 1;
}
if (val != testData[4]) {
printf("atomicInc failed\n");
return false;
}
limit = 137;
val = 0;
for (int i = 0; i < len * LOOP_NUM; ++i) {
val = ((val == 0) || (val > limit)) ? limit : val - 1;
}
if (val != testData[5]) {
printf("atomicDec failed\n");
return false;
}
found = false;
for (int i = 0; i < len; ++i) {
// seventh element should be a member of [0, len)
if (i == testData[6]) {
found = true;
break;
}
}
if (!found) {
printf("atomicCAS failed\n");
return false;
}
val = 0xff;
for (int i = 0; i < len; ++i) {
// 8th element should be 1
val &= (2 * i + 7);
}
if (val != testData[7]) {
printf("atomicAnd failed\n");
return false;
}
val = 0;
for (int i = 0; i < len; ++i) {
// 9th element should be 0xff
val |= (1 << i);
}
if (val != testData[8]) {
printf("atomicOr failed\n");
return false;
}
val = 0xff;
for (int i = 0; i < len; ++i) {
// 10th element should be 0xff
val ^= i;
}
if (val != testData[9]) {
printf("atomicXor failed\n");
return false;
}
return true;
} }
int main(int argc, char **argv) { int main(int argc, char **argv)
// set device {
cudaDeviceProp device_prop; // set device
int dev_id = findCudaDevice(argc, (const char **)argv); cudaDeviceProp device_prop;
checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id)); int dev_id = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
if (!device_prop.managedMemory) { if (!device_prop.managedMemory) {
// This sample requires being run on a device that supports Unified Memory // This sample requires being run on a device that supports Unified Memory
fprintf(stderr, "Unified Memory not supported on this device\n"); fprintf(stderr, "Unified Memory not supported on this device\n");
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
} }
if (device_prop.computeMode == cudaComputeModeProhibited) { if (device_prop.computeMode == cudaComputeModeProhibited) {
// This sample requires being run with a default or process exclusive mode // This sample requires being run with a default or process exclusive mode
fprintf(stderr, fprintf(stderr,
"This sample requires a device in either default or process " "This sample requires a device in either default or process "
"exclusive mode\n"); "exclusive mode\n");
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
} }
if (device_prop.major < 6) { if (device_prop.major < 6) {
printf( printf("%s: requires a minimum CUDA compute 6.0 capability, waiving "
"%s: requires a minimum CUDA compute 6.0 capability, waiving " "testing.\n",
"testing.\n", argv[0]);
argv[0]); exit(EXIT_WAIVED);
exit(EXIT_WAIVED); }
}
unsigned int numThreads = 256; unsigned int numThreads = 256;
unsigned int numBlocks = 64; unsigned int numBlocks = 64;
unsigned int numData = 10; unsigned int numData = 10;
int *atom_arr; int *atom_arr;
if (device_prop.pageableMemoryAccess) { if (device_prop.pageableMemoryAccess) {
printf("CAN access pageable memory\n"); printf("CAN access pageable memory\n");
atom_arr = (int *)malloc(sizeof(int) * numData); atom_arr = (int *)malloc(sizeof(int) * numData);
} else { }
printf("CANNOT access pageable memory\n"); else {
checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData)); printf("CANNOT access pageable memory\n");
} checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData));
}
for (unsigned int i = 0; i < numData; i++) atom_arr[i] = 0; for (unsigned int i = 0; i < numData; i++)
atom_arr[i] = 0;
// To make the AND and XOR tests generate something other than 0... // To make the AND and XOR tests generate something other than 0...
atom_arr[7] = atom_arr[9] = 0xff; atom_arr[7] = atom_arr[9] = 0xff;
atomicKernel<<<numBlocks, numThreads>>>(atom_arr); atomicKernel<<<numBlocks, numThreads>>>(atom_arr);
atomicKernel_CPU(atom_arr, numBlocks * numThreads); atomicKernel_CPU(atom_arr, numBlocks * numThreads);
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
// Compute & verify reference solution // Compute & verify reference solution
int testResult = verify(atom_arr, 2 * numThreads * numBlocks); int testResult = verify(atom_arr, 2 * numThreads * numBlocks);
if (device_prop.pageableMemoryAccess) { if (device_prop.pageableMemoryAccess) {
free(atom_arr); free(atom_arr);
} else { }
cudaFree(atom_arr); else {
} cudaFree(atom_arr);
}
printf("systemWideAtomics completed, returned %s \n", printf("systemWideAtomics completed, returned %s \n", testResult ? "OK" : "ERROR!");
testResult ? "OK" : "ERROR!"); exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
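As an aside, the allocation branch above keys off device_prop.pageableMemoryAccess; the same capability can also be queried as a device attribute. A small sketch (dev_id as in main()):
int pageableAccess = 0;
checkCudaErrors(cudaDeviceGetAttribute(&pageableAccess, cudaDevAttrPageableMemoryAccess, dev_id));
// Non-zero: plain malloc() memory is GPU-accessible, so cudaMallocManaged can be skipped for atom_arr.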
@ -31,10 +31,10 @@
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes CUDA // includes CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
@ -47,34 +47,34 @@
// declaration, forward // declaration, forward
void runTest(int argc, char **argv); void runTest(int argc, char **argv);
extern "C" void computeGold(float *reference, float *idata, extern "C" void computeGold(float *reference, float *idata, const unsigned int len);
const unsigned int len);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality //! Simple test kernel for device functionality
//! @param g_idata input data in global memory //! @param g_idata input data in global memory
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(float *g_idata, float *g_odata) { __global__ void testKernel(float *g_idata, float *g_odata)
// shared memory {
// the size is determined by the host application // shared memory
extern __shared__ float sdata[]; // the size is determined by the host application
extern __shared__ float sdata[];
// access thread id // access thread id
const unsigned int tid = threadIdx.x; const unsigned int tid = threadIdx.x;
// access number of threads in this block // access number of threads in this block
const unsigned int num_threads = blockDim.x; const unsigned int num_threads = blockDim.x;
// read in input data from global memory // read in input data from global memory
sdata[tid] = g_idata[tid]; sdata[tid] = g_idata[tid];
__syncthreads(); __syncthreads();
// perform some computations // perform some computations
sdata[tid] = (float)num_threads * sdata[tid]; sdata[tid] = (float)num_threads * sdata[tid];
__syncthreads(); __syncthreads();
// write data to global memory // write data to global memory
g_odata[tid] = sdata[tid]; g_odata[tid] = sdata[tid];
} }
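A worked instance (using the values runTest() sets below: num_threads = 32 and h_idata[i] = i): each thread scales its element by blockDim.x, so
// h_odata[i] == 32.0f * i   for i in [0, 32), e.g. h_odata[5] == 160.0f,
// which matches computeGold()'s reference[i] = idata[i] * f_len.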
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -85,81 +85,81 @@ int main(int argc, char **argv) { runTest(argc, argv); }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
bool bTestResult = true; {
bool bTestResult = true;
printf("%s Starting...\n\n", argv[0]); printf("%s Starting...\n\n", argv[0]);
// use command-line specified CUDA device, otherwise use device with highest // use command-line specified CUDA device, otherwise use device with highest
// Gflops/s // Gflops/s
int devID = findCudaDevice(argc, (const char **)argv); int devID = findCudaDevice(argc, (const char **)argv);
StopWatchInterface *timer = 0; StopWatchInterface *timer = 0;
sdkCreateTimer(&timer); sdkCreateTimer(&timer);
sdkStartTimer(&timer); sdkStartTimer(&timer);
unsigned int num_threads = 32; unsigned int num_threads = 32;
unsigned int mem_size = sizeof(float) * num_threads; unsigned int mem_size = sizeof(float) * num_threads;
// allocate host memory // allocate host memory
float *h_idata = (float *)malloc(mem_size); float *h_idata = (float *)malloc(mem_size);
// initialize the memory // initialize the memory
for (unsigned int i = 0; i < num_threads; ++i) { for (unsigned int i = 0; i < num_threads; ++i) {
h_idata[i] = (float)i; h_idata[i] = (float)i;
} }
// allocate device memory // allocate device memory
float *d_idata; float *d_idata;
checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size)); checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
// copy host memory to device // copy host memory to device
checkCudaErrors( checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
// allocate device memory for result // allocate device memory for result
float *d_odata; float *d_odata;
checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size)); checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size));
// setup execution parameters // setup execution parameters
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
dim3 threads(num_threads, 1, 1); dim3 threads(num_threads, 1, 1);
// execute the kernel // execute the kernel
testKernel<<<grid, threads, mem_size>>>(d_idata, d_odata); testKernel<<<grid, threads, mem_size>>>(d_idata, d_odata);
// check if kernel execution generated an error // check if kernel execution generated an error
getLastCudaError("Kernel execution failed"); getLastCudaError("Kernel execution failed");
// allocate mem for the result on host side // allocate mem for the result on host side
float *h_odata = (float *)malloc(mem_size); float *h_odata = (float *)malloc(mem_size);
// copy result from device to host // copy result from device to host
checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads, checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads, cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost));
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
sdkDeleteTimer(&timer); sdkDeleteTimer(&timer);
// compute reference solution // compute reference solution
float *reference = (float *)malloc(mem_size); float *reference = (float *)malloc(mem_size);
computeGold(reference, h_idata, num_threads); computeGold(reference, h_idata, num_threads);
// check result // check result
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// write file for regression test // write file for regression test
sdkWriteFile("./data/regression.dat", h_odata, num_threads, 0.0f, false); sdkWriteFile("./data/regression.dat", h_odata, num_threads, 0.0f, false);
} else { }
// custom output handling when no regression test running else {
// in this case check if the result is equivalent to the expected solution // custom output handling when no regression test running
bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f); // in this case check if the result is equivalent to the expected solution
} bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f);
}
// cleanup memory // cleanup memory
free(h_idata); free(h_idata);
free(h_odata); free(h_odata);
free(reference); free(reference);
checkCudaErrors(cudaFree(d_idata)); checkCudaErrors(cudaFree(d_idata));
checkCudaErrors(cudaFree(d_odata)); checkCudaErrors(cudaFree(d_odata));
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
@ -26,8 +26,7 @@
*/ */
// export C interface // export C interface
extern "C" void computeGold(float *reference, float *idata, extern "C" void computeGold(float *reference, float *idata, const unsigned int len);
const unsigned int len);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set //! Compute reference data set
@ -36,10 +35,11 @@ extern "C" void computeGold(float *reference, float *idata,
//! @param idata input data as provided to device //! @param idata input data as provided to device
//! @param len number of elements in reference / idata //! @param len number of elements in reference / idata
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void computeGold(float *reference, float *idata, const unsigned int len) { void computeGold(float *reference, float *idata, const unsigned int len)
const float f_len = static_cast<float>(len); {
const float f_len = static_cast<float>(len);
for (unsigned int i = 0; i < len; ++i) { for (unsigned int i = 0; i < len; ++i) {
reference[i] = idata[i] * f_len; reference[i] = idata[i] * f_len;
} }
} }
@ -37,7 +37,6 @@
// For the CUDA runtime routines (prefixed with "cuda_") // For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <helper_cuda.h> #include <helper_cuda.h>
/** /**
* CUDA Kernel Device code * CUDA Kernel Device code
@ -45,166 +44,153 @@
* Computes the vector addition of A and B into C. The 3 vectors have the same * Computes the vector addition of A and B into C. The 3 vectors have the same
* number of elements numElements. * number of elements numElements.
*/ */
__global__ void vectorAdd(const float *A, const float *B, float *C, __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
int numElements) { {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements) { if (i < numElements) {
C[i] = A[i] + B[i] + 0.0f; C[i] = A[i] + B[i] + 0.0f;
} }
} }
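The bounds check above pairs with the rounded-up launch configuration in main(); with the sample's own numbers:
// blocksPerGrid    = (50000 + 256 - 1) / 256 = 196
// threads launched = 196 * 256 = 50176, i.e. 176 more than numElements,
// so the (i < numElements) guard keeps the surplus threads from writing past the end of C.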
/** /**
* Host main routine * Host main routine
*/ */
int main(void) { int main(void)
// Error code to check return values for CUDA calls {
cudaError_t err = cudaSuccess; // Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size // Print the vector length to be used, and compute its size
int numElements = 50000; int numElements = 50000;
size_t size = numElements * sizeof(float); size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements); printf("[Vector addition of %d elements]\n", numElements);
// Allocate the host input vector A // Allocate the host input vector A
float *h_A = (float *)malloc(size); float *h_A = (float *)malloc(size);
// Allocate the host input vector B // Allocate the host input vector B
float *h_B = (float *)malloc(size); float *h_B = (float *)malloc(size);
// Allocate the host output vector C // Allocate the host output vector C
float *h_C = (float *)malloc(size); float *h_C = (float *)malloc(size);
// Verify that allocations succeeded // Verify that allocations succeeded
if (h_A == NULL || h_B == NULL || h_C == NULL) { if (h_A == NULL || h_B == NULL || h_C == NULL) {
fprintf(stderr, "Failed to allocate host vectors!\n"); fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
}
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i) {
h_A[i] = rand() / (float)RAND_MAX;
h_B[i] = rand() / (float)RAND_MAX;
}
// Allocate the device input vector A
float *d_A = NULL;
err = cudaMalloc((void **)&d_A, size);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Allocate the device input vector B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Allocate the device output vector C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the host input vectors A and B in host memory to the device input
// vectors in
// device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr,
"Failed to copy vector A from host to device (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr,
"Failed to copy vector B from host to device (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
threadsPerBlock);
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the device result vector in device memory to the host result vector
// in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr,
"Failed to copy vector C from device to host (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i) {
if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
} }
}
printf("Test PASSED\n"); // Initialize the host input vectors
for (int i = 0; i < numElements; ++i) {
h_A[i] = rand() / (float)RAND_MAX;
h_B[i] = rand() / (float)RAND_MAX;
}
// Free device global memory // Allocate the device input vector A
err = cudaFree(d_A); float *d_A = NULL;
err = cudaMalloc((void **)&d_A, size);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector A (error code %s)!\n", fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
cudaGetErrorString(err)); exit(EXIT_FAILURE);
exit(EXIT_FAILURE); }
}
err = cudaFree(d_B); // Allocate the device input vector B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector B (error code %s)!\n", fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
cudaGetErrorString(err)); exit(EXIT_FAILURE);
exit(EXIT_FAILURE); }
}
err = cudaFree(d_C); // Allocate the device output vector C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector C (error code %s)!\n", fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
cudaGetErrorString(err)); exit(EXIT_FAILURE);
exit(EXIT_FAILURE); }
}
// Free host memory // Copy the host input vectors A and B in host memory to the device input
free(h_A); // vectors in
free(h_B); // device memory
free(h_C); printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
printf("Done\n"); if (err != cudaSuccess) {
return 0; fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the device result vector in device memory to the host result vector
// in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i) {
if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
err = cudaFree(d_A);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_B);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_C);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Free host memory
free(h_A);
free(h_B);
free(h_C);
printf("Done\n");
return 0;
} }
@ -34,11 +34,11 @@
*/ */
// Includes // Includes
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring> #include <cstring>
#include <cuda.h> #include <cuda.h>
#include <iostream>
#include <stdio.h>
#include <string.h>
// includes, project // includes, project
#include <helper_cuda_drvapi.h> #include <helper_cuda_drvapi.h>
@ -50,19 +50,19 @@
using namespace std; using namespace std;
// Variables // Variables
CUdevice cuDevice; CUdevice cuDevice;
CUcontext cuContext; CUcontext cuContext;
CUmodule cuModule; CUmodule cuModule;
CUfunction vecAdd_kernel; CUfunction vecAdd_kernel;
float *h_A; float *h_A;
float *h_B; float *h_B;
float *h_C; float *h_C;
CUdeviceptr d_A; CUdeviceptr d_A;
CUdeviceptr d_B; CUdeviceptr d_B;
CUdeviceptr d_C; CUdeviceptr d_C;
// Functions // Functions
int CleanupNoFailure(); int CleanupNoFailure();
void RandomInit(float *, int); void RandomInit(float *, int);
bool findModulePath(const char *, string &, char **, string &); bool findModulePath(const char *, string &, char **, string &);
@ -72,150 +72,152 @@ bool findModulePath(const char *, string &, char **, string &);
#endif #endif
// Host code // Host code
int main(int argc, char **argv) { int main(int argc, char **argv)
printf("Vector Addition (Driver API)\n"); {
int N = 50000, devID = 0; printf("Vector Addition (Driver API)\n");
size_t size = N * sizeof(float); int N = 50000, devID = 0;
size_t size = N * sizeof(float);
// Initialize // Initialize
checkCudaErrors(cuInit(0)); checkCudaErrors(cuInit(0));
cuDevice = findCudaDeviceDRV(argc, (const char **)argv); cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// Create context // Create context
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
// first search for the module path before we load the results // first search for the module path before we load the results
string module_path; string module_path;
std::ostringstream fatbin; std::ostringstream fatbin;
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else { }
printf("> initCUDA loading module: <%s>\n", module_path.c_str()); else {
} printf("> initCUDA loading module: <%s>\n", module_path.c_str());
}
if (!fatbin.str().size()) { if (!fatbin.str().size()) {
printf("fatbin file empty. exiting..\n"); printf("fatbin file empty. exiting..\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
// Create module from binary file (FATBIN) // Create module from binary file (FATBIN)
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
// Get function handle from module // Get function handle from module
checkCudaErrors( checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));
cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));
// Allocate input vectors h_A and h_B in host memory // Allocate input vectors h_A and h_B in host memory
h_A = (float *)malloc(size); h_A = (float *)malloc(size);
h_B = (float *)malloc(size); h_B = (float *)malloc(size);
h_C = (float *)malloc(size); h_C = (float *)malloc(size);
// Initialize input vectors // Initialize input vectors
RandomInit(h_A, N); RandomInit(h_A, N);
RandomInit(h_B, N); RandomInit(h_B, N);
// Allocate vectors in device memory // Allocate vectors in device memory
checkCudaErrors(cuMemAlloc(&d_A, size)); checkCudaErrors(cuMemAlloc(&d_A, size));
checkCudaErrors(cuMemAlloc(&d_B, size)); checkCudaErrors(cuMemAlloc(&d_B, size));
checkCudaErrors(cuMemAlloc(&d_C, size)); checkCudaErrors(cuMemAlloc(&d_C, size));
// Copy vectors from host memory to device memory // Copy vectors from host memory to device memory
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size)); checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size)); checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size));
if (1) { if (1) {
// This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel
// Launch (simpler method) // Launch (simpler method)
// Grid/Block configuration // Grid/Block configuration
int threadsPerBlock = 256; int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
void *args[] = {&d_A, &d_B, &d_C, &N}; void *args[] = {&d_A, &d_B, &d_C, &N};
// Launch the CUDA kernel // Launch the CUDA kernel
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));
threadsPerBlock, 1, 1, 0, NULL, args, NULL)); }
} else { else {
// This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel
// Launch (advanced method) // Launch (advanced method)
int offset = 0; int offset = 0;
void *argBuffer[16]; void *argBuffer[16];
*((CUdeviceptr *)&argBuffer[offset]) = d_A; *((CUdeviceptr *)&argBuffer[offset]) = d_A;
offset += sizeof(d_A); offset += sizeof(d_A);
*((CUdeviceptr *)&argBuffer[offset]) = d_B; *((CUdeviceptr *)&argBuffer[offset]) = d_B;
offset += sizeof(d_B); offset += sizeof(d_B);
*((CUdeviceptr *)&argBuffer[offset]) = d_C; *((CUdeviceptr *)&argBuffer[offset]) = d_C;
offset += sizeof(d_C); offset += sizeof(d_C);
*((int *)&argBuffer[offset]) = N; *((int *)&argBuffer[offset]) = N;
offset += sizeof(N); offset += sizeof(N);
// Grid/Block configuration // Grid/Block configuration
int threadsPerBlock = 256; int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
// Launch the CUDA kernel // Launch the CUDA kernel
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, checkCudaErrors(
threadsPerBlock, 1, 1, 0, NULL, NULL, cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, argBuffer));
argBuffer)); }
}
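    // Illustrative note (not part of the sample): when kernelParams is NULL, the
    // packed argument buffer is normally passed through the "extra" key/value
    // array rather than as a bare pointer, e.g.:
    //
    //   size_t argBufferSize = offset;
    //   void  *extra[]       = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
    //                           CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBufferSize,
    //                           CU_LAUNCH_PARAM_END};
    //   cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
    //                  threadsPerBlock, 1, 1, 0, NULL, NULL, extra);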
#ifdef _DEBUG #ifdef _DEBUG
checkCudaErrors(cuCtxSynchronize()); checkCudaErrors(cuCtxSynchronize());
#endif #endif
// Copy result from device memory to host memory // Copy result from device memory to host memory
// h_C contains the result in host memory // h_C contains the result in host memory
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size)); checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size));
// Verify result // Verify result
int i; int i;
for (i = 0; i < N; ++i) { for (i = 0; i < N; ++i) {
float sum = h_A[i] + h_B[i]; float sum = h_A[i] + h_B[i];
if (fabs(h_C[i] - sum) > 1e-7f) { if (fabs(h_C[i] - sum) > 1e-7f) {
break; break;
}
} }
}
CleanupNoFailure(); CleanupNoFailure();
printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL"); printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");
exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE); exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
} }
int CleanupNoFailure() { int CleanupNoFailure()
// Free device memory {
checkCudaErrors(cuMemFree(d_A)); // Free device memory
checkCudaErrors(cuMemFree(d_B)); checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_C)); checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
// Free host memory // Free host memory
if (h_A) { if (h_A) {
free(h_A); free(h_A);
} }
if (h_B) { if (h_B) {
free(h_B); free(h_B);
} }
if (h_C) { if (h_C) {
free(h_C); free(h_C);
} }
checkCudaErrors(cuCtxDestroy(cuContext)); checkCudaErrors(cuCtxDestroy(cuContext));
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }
// Allocates an array with random float entries. // Allocates an array with random float entries.
void RandomInit(float *data, int n) { void RandomInit(float *data, int n)
for (int i = 0; i < n; ++i) { {
data[i] = rand() / (float)RAND_MAX; for (int i = 0; i < n; ++i) {
} data[i] = rand() / (float)RAND_MAX;
}
} }
@ -33,9 +33,10 @@
*/ */
// Device code // Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
float *C, int N) { {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) C[i] = A[i] + B[i]; if (i < N)
C[i] = A[i] + B[i];
} }
@ -30,4 +30,3 @@ cuMemcpyDtoH, cuDeviceCanAccessPeer, cuModuleGetFunction, cuMemSetAccess, cuMemR
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details) ## References (for more details)
@ -29,172 +29,172 @@
static size_t round_up(size_t x, size_t y) { return ((x + y - 1) / y) * y; } static size_t round_up(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
CUresult simpleMallocMultiDeviceMmap( CUresult simpleMallocMultiDeviceMmap(CUdeviceptr *dptr,
CUdeviceptr *dptr, size_t *allocationSize, size_t size, size_t *allocationSize,
const std::vector<CUdevice> &residentDevices, size_t size,
const std::vector<CUdevice> &mappingDevices, size_t align) { const std::vector<CUdevice> &residentDevices,
CUresult status = CUDA_SUCCESS; const std::vector<CUdevice> &mappingDevices,
size_t min_granularity = 0; size_t align)
size_t stripeSize; {
CUresult status = CUDA_SUCCESS;
size_t min_granularity = 0;
size_t stripeSize;
// Setup the properties common for all the chunks // Setup the properties common for all the chunks
// The allocations will be device pinned memory. // The allocations will be device pinned memory.
// This property structure describes the physical location where the memory // This property structure describes the physical location where the memory
// will be allocated via cuMemCreate along with additional properties. In this // will be allocated via cuMemCreate along with additional properties. In this
// case, the allocation will be pinned device memory local to a given device. // case, the allocation will be pinned device memory local to a given device.
CUmemAllocationProp prop = {}; CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
// Get the minimum granularity needed for the resident devices // Get the minimum granularity needed for the resident devices
// (the max of the minimum granularity of each participating device) // (the max of the minimum granularity of each participating device)
for (int idx = 0; idx < residentDevices.size(); idx++) { for (int idx = 0; idx < residentDevices.size(); idx++) {
size_t granularity = 0; size_t granularity = 0;
// get the minimum granularity for residentDevices[idx] // get the minimum granularity for residentDevices[idx]
prop.location.id = residentDevices[idx]; prop.location.id = residentDevices[idx];
status = cuMemGetAllocationGranularity(&granularity, &prop, status = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
CU_MEM_ALLOC_GRANULARITY_MINIMUM); if (status != CUDA_SUCCESS) {
if (status != CUDA_SUCCESS) { goto done;
goto done; }
} if (min_granularity < granularity) {
if (min_granularity < granularity) { min_granularity = granularity;
min_granularity = granularity; }
}
}
// Get the minimum granularity needed for the accessing devices
// (the max of the minimum granularity of each participating device)
for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
size_t granularity = 0;
// get the minimum granularity for mappingDevices[idx]
prop.location.id = mappingDevices[idx];
status = cuMemGetAllocationGranularity(&granularity, &prop,
CU_MEM_ALLOC_GRANULARITY_MINIMUM);
if (status != CUDA_SUCCESS) {
goto done;
}
if (min_granularity < granularity) {
min_granularity = granularity;
}
}
// Round up the size such that we can evenly split it into a stripe size that
// meets the granularity requirements. Essentially size = N *
// residentDevices.size() * min_granularity is the requirement, since each
// piece of the allocation will be stripeSize = N * min_granularity and the
// min_granularity requirement applies to each stripeSize piece of the
// allocation.
size = round_up(size, residentDevices.size() * min_granularity);
stripeSize = size / residentDevices.size();
// Return the rounded up size to the caller for use in the free
if (allocationSize) {
*allocationSize = size;
}
// Reserve the required contiguous VA space for the allocations
status = cuMemAddressReserve(dptr, size, align, 0, 0);
if (status != CUDA_SUCCESS) {
goto done;
}
// Create and map the backings on each gpu
// note: reusing CUmemAllocationProp prop from earlier with prop.type &
// prop.location.type already specified.
for (size_t idx = 0; idx < residentDevices.size(); idx++) {
CUresult status2 = CUDA_SUCCESS;
// Set the location for this chunk to this device
prop.location.id = residentDevices[idx];
// Create the allocation as a pinned allocation on this device
CUmemGenericAllocationHandle allocationHandle;
status = cuMemCreate(&allocationHandle, stripeSize, &prop, 0);
if (status != CUDA_SUCCESS) {
goto done;
} }
// Assign the chunk to the appropriate VA range and release the handle. // Get the minimum granularity needed for the accessing devices
// After mapping the memory, it can be referenced by virtual address. // (the max of the minimum granularity of each participating device)
// Since we do not need to make any other mappings of this memory or export
// it, we no longer need and can release the allocationHandle. The
// allocation will be kept live until it is unmapped.
status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0,
allocationHandle, 0);
// the handle needs to be released even if the mapping failed.
status2 = cuMemRelease(allocationHandle);
if (status == CUDA_SUCCESS) {
// cuMemRelease should not have failed here
// as the handle was just allocated successfully
// however return an error if it does.
status = status2;
}
// Cleanup in case of any mapping failures.
if (status != CUDA_SUCCESS) {
goto done;
}
}
{
// Each accessDescriptor will describe the mapping requirement for a single
// device
std::vector<CUmemAccessDesc> accessDescriptors;
accessDescriptors.resize(mappingDevices.size());
// Prepare the access descriptor array indicating where and how the backings
// should be visible.
for (size_t idx = 0; idx < mappingDevices.size(); idx++) { for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
// Specify which device we are adding mappings for. size_t granularity = 0;
accessDescriptors[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDescriptors[idx].location.id = mappingDevices[idx];
// Specify both read and write access. // get the minimum granularity for mappingDevices[idx]
accessDescriptors[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; prop.location.id = mappingDevices[idx];
status = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
if (status != CUDA_SUCCESS) {
goto done;
}
if (min_granularity < granularity) {
min_granularity = granularity;
}
} }
// Apply the access descriptors to the whole VA range. // Round up the size such that we can evenly split it into a stripe size that
status = cuMemSetAccess(*dptr, size, &accessDescriptors[0], // meets the granularity requirements. Essentially size = N *
accessDescriptors.size()); // residentDevices.size() * min_granularity is the requirement, since each
// piece of the allocation will be stripeSize = N * min_granularity and the
// min_granularity requirement applies to each stripeSize piece of the
// allocation.
size = round_up(size, residentDevices.size() * min_granularity);
stripeSize = size / residentDevices.size();
// Return the rounded up size to the caller for use in the free
if (allocationSize) {
*allocationSize = size;
}
// Reserve the required contiguous VA space for the allocations
status = cuMemAddressReserve(dptr, size, align, 0, 0);
if (status != CUDA_SUCCESS) { if (status != CUDA_SUCCESS) {
goto done; goto done;
}
// Create and map the backings on each gpu
// note: reusing CUmemAllocationProp prop from earlier with prop.type &
// prop.location.type already specified.
for (size_t idx = 0; idx < residentDevices.size(); idx++) {
CUresult status2 = CUDA_SUCCESS;
// Set the location for this chunk to this device
prop.location.id = residentDevices[idx];
// Create the allocation as a pinned allocation on this device
CUmemGenericAllocationHandle allocationHandle;
status = cuMemCreate(&allocationHandle, stripeSize, &prop, 0);
if (status != CUDA_SUCCESS) {
goto done;
}
// Assign the chunk to the appropriate VA range and release the handle.
// After mapping the memory, it can be referenced by virtual address.
// Since we do not need to make any other mappings of this memory or export
// it, we no longer need and can release the allocationHandle. The
// allocation will be kept live until it is unmapped.
status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0, allocationHandle, 0);
// the handle needs to be released even if the mapping failed.
status2 = cuMemRelease(allocationHandle);
if (status == CUDA_SUCCESS) {
// cuMemRelease should not have failed here
// as the handle was just allocated successfully
// however return an error if it does.
status = status2;
}
// Cleanup in case of any mapping failures.
if (status != CUDA_SUCCESS) {
goto done;
}
}
{
// Each accessDescriptor will describe the mapping requirement for a single
// device
std::vector<CUmemAccessDesc> accessDescriptors;
accessDescriptors.resize(mappingDevices.size());
// Prepare the access descriptor array indicating where and how the backings
// should be visible.
for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
// Specify which device we are adding mappings for.
accessDescriptors[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDescriptors[idx].location.id = mappingDevices[idx];
// Specify both read and write access.
accessDescriptors[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
}
// Apply the access descriptors to the whole VA range.
status = cuMemSetAccess(*dptr, size, &accessDescriptors[0], accessDescriptors.size());
if (status != CUDA_SUCCESS) {
goto done;
}
} }
}
done: done:
if (status != CUDA_SUCCESS) { if (status != CUDA_SUCCESS) {
if (*dptr) { if (*dptr) {
simpleFreeMultiDeviceMmap(*dptr, size); simpleFreeMultiDeviceMmap(*dptr, size);
}
} }
}
return status; return status;
} }
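A worked example of the striping arithmetic above (the device count and granularity here are illustrative assumptions, not values from the sample):
// 2 resident devices, 2 MiB minimum granularity, 5 MiB requested:
//   size       = round_up(5 MiB, 2 * 2 MiB) = 8 MiB   (returned via *allocationSize)
//   stripeSize = 8 MiB / 2                  = 4 MiB   (one cuMemCreate + cuMemMap per device)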
CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size) { CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size)
CUresult status = CUDA_SUCCESS; {
CUresult status = CUDA_SUCCESS;
// Unmap the mapped virtual memory region // Unmap the mapped virtual memory region
// Since the handles to the mapped backing stores have already been released // Since the handles to the mapped backing stores have already been released
// by cuMemRelease, and these are the only/last mappings referencing them, // by cuMemRelease, and these are the only/last mappings referencing them,
// The backing stores will be freed. // The backing stores will be freed.
// Since the memory has been unmapped after this call, accessing the specified // Since the memory has been unmapped after this call, accessing the specified
// VA range will result in a fault (until it is remapped). // VA range will result in a fault (until it is remapped).
status = cuMemUnmap(dptr, size); status = cuMemUnmap(dptr, size);
if (status != CUDA_SUCCESS) { if (status != CUDA_SUCCESS) {
return status; return status;
} }
// Free the virtual address region. This allows the virtual address region // Free the virtual address region. This allows the virtual address region
// to be reused by future cuMemAddressReserve calls. This also allows the // to be reused by future cuMemAddressReserve calls. This also allows the
// virtual address region to be used by other allocation made through // virtual address region to be used by other allocation made through
// operating system calls like malloc & mmap. // operating system calls like malloc & mmap.
status = cuMemAddressFree(dptr, size); status = cuMemAddressFree(dptr, size);
if (status != CUDA_SUCCESS) { if (status != CUDA_SUCCESS) {
return status; return status;
} }
return status; return status;
} }
@ -63,10 +63,12 @@
//! handle //! handle
//! is not needed after its mappings are set up. //! is not needed after its mappings are set up.
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
CUresult simpleMallocMultiDeviceMmap( CUresult simpleMallocMultiDeviceMmap(CUdeviceptr *dptr,
CUdeviceptr *dptr, size_t *allocationSize, size_t size, size_t *allocationSize,
const std::vector<CUdevice> &residentDevices, size_t size,
const std::vector<CUdevice> &mappingDevices, size_t align = 0); const std::vector<CUdevice> &residentDevices,
const std::vector<CUdevice> &mappingDevices,
size_t align = 0);
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
//! Frees resources allocated by simpleMallocMultiDeviceMmap //! Frees resources allocated by simpleMallocMultiDeviceMmap
@ -36,11 +36,11 @@
*/ */
// Includes // Includes
#include <cstring>
#include <cuda.h> #include <cuda.h>
#include <iostream>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <cstring>
#include <iostream>
// includes, project // includes, project
#include <helper_cuda_drvapi.h> #include <helper_cuda_drvapi.h>
@ -54,115 +54,111 @@
using namespace std; using namespace std;
// Variables // Variables
CUdevice cuDevice; CUdevice cuDevice;
CUcontext cuContext; CUcontext cuContext;
CUmodule cuModule; CUmodule cuModule;
CUfunction vecAdd_kernel; CUfunction vecAdd_kernel;
float *h_A; float *h_A;
float *h_B; float *h_B;
float *h_C; float *h_C;
CUdeviceptr d_A; CUdeviceptr d_A;
CUdeviceptr d_B; CUdeviceptr d_B;
CUdeviceptr d_C; CUdeviceptr d_C;
size_t allocationSize = 0; size_t allocationSize = 0;
// Functions // Functions
int CleanupNoFailure(); int CleanupNoFailure();
void RandomInit(float *, int); void RandomInit(float *, int);
//define input fatbin file // define input fatbin file
#ifndef FATBIN_FILE #ifndef FATBIN_FILE
#define FATBIN_FILE "vectorAdd_kernel64.fatbin" #define FATBIN_FILE "vectorAdd_kernel64.fatbin"
#endif #endif
// collect all of the devices whose memory can be mapped from cuDevice. // collect all of the devices whose memory can be mapped from cuDevice.
vector<CUdevice> getBackingDevices(CUdevice cuDevice) { vector<CUdevice> getBackingDevices(CUdevice cuDevice)
int num_devices; {
int num_devices;
checkCudaErrors(cuDeviceGetCount(&num_devices)); checkCudaErrors(cuDeviceGetCount(&num_devices));
vector<CUdevice> backingDevices; vector<CUdevice> backingDevices;
backingDevices.push_back(cuDevice); backingDevices.push_back(cuDevice);
for (int dev = 0; dev < num_devices; dev++) { for (int dev = 0; dev < num_devices; dev++) {
int capable = 0; int capable = 0;
int attributeVal = 0; int attributeVal = 0;
// The mapping device is already in the backingDevices vector // The mapping device is already in the backingDevices vector
if (dev == cuDevice) { if (dev == cuDevice) {
continue; continue;
}
// Only peer capable devices can map each others memory
checkCudaErrors(cuDeviceCanAccessPeer(&capable, cuDevice, dev));
if (!capable) {
continue;
}
// The device needs to support virtual address management for the required
// apis to work
checkCudaErrors(
cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice));
if (attributeVal == 0) {
continue;
}
backingDevices.push_back(dev);
} }
return backingDevices;
// Only peer capable devices can map each others memory
checkCudaErrors(cuDeviceCanAccessPeer(&capable, cuDevice, dev));
if (!capable) {
continue;
}
// The device needs to support virtual address management for the required
// apis to work
checkCudaErrors(cuDeviceGetAttribute(
&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
cuDevice));
if (attributeVal == 0) {
continue;
}
backingDevices.push_back(dev);
}
return backingDevices;
} }
// Host code // Host code
int main(int argc, char **argv) { int main(int argc, char **argv)
printf("Vector Addition (Driver API)\n"); {
int N = 50000; printf("Vector Addition (Driver API)\n");
size_t size = N * sizeof(float); int N = 50000;
int attributeVal = 0; size_t size = N * sizeof(float);
int attributeVal = 0;
// Initialize // Initialize
checkCudaErrors(cuInit(0)); checkCudaErrors(cuInit(0));
cuDevice = findCudaDeviceDRV(argc, (const char **)argv); cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// Check that the selected device supports virtual address management // Check that the selected device supports virtual address management
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(
&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice));
cuDevice)); printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice, attributeVal);
printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice, if (attributeVal == 0) {
attributeVal); printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice);
if (attributeVal == 0) { exit(EXIT_WAIVED);
printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice); }
exit(EXIT_WAIVED);
}
// The vector addition happens on cuDevice, so the allocations need to be // The vector addition happens on cuDevice, so the allocations need to be
// mapped there. // mapped there.
vector<CUdevice> mappingDevices; vector<CUdevice> mappingDevices;
mappingDevices.push_back(cuDevice); mappingDevices.push_back(cuDevice);
// Collect devices accessible by the mapping device (cuDevice) into the // Collect devices accessible by the mapping device (cuDevice) into the
// backingDevices vector. // backingDevices vector.
vector<CUdevice> backingDevices = getBackingDevices(cuDevice); vector<CUdevice> backingDevices = getBackingDevices(cuDevice);
// Create context // Create context
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
// first search for the module path before we load the results // first search for the module path before we load the results
string module_path; string module_path;
std::ostringstream fatbin; std::ostringstream fatbin;
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
{
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
else else {
{
printf("> initCUDA loading module: <%s>\n", module_path.c_str()); printf("> initCUDA loading module: <%s>\n", module_path.c_str());
} }
if (!fatbin.str().size()) if (!fatbin.str().size()) {
{
printf("fatbin file empty. exiting..\n"); printf("fatbin file empty. exiting..\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -204,13 +200,10 @@ int main(int argc, char **argv) {
int threadsPerBlock = 256; int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
void *args[] = { &d_A, &d_B, &d_C, &N }; void *args[] = {&d_A, &d_B, &d_C, &N};
// Launch the CUDA kernel // Launch the CUDA kernel
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));
threadsPerBlock, 1, 1,
0,
NULL, args, NULL));
// Copy result from device memory to host memory // Copy result from device memory to host memory
// h_C contains the result in host memory // h_C contains the result in host memory
@ -219,20 +212,18 @@ int main(int argc, char **argv) {
// Verify result // Verify result
int i; int i;
for (i = 0; i < N; ++i) for (i = 0; i < N; ++i) {
{
float sum = h_A[i] + h_B[i]; float sum = h_A[i] + h_B[i];
if (fabs(h_C[i] - sum) > 1e-7f) if (fabs(h_C[i] - sum) > 1e-7f) {
{
break; break;
} }
} }
CleanupNoFailure(); CleanupNoFailure();
printf("%s\n", (i==N) ? "Result = PASS" : "Result = FAIL"); printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");
exit((i==N) ? EXIT_SUCCESS : EXIT_FAILURE); exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
} }
int CleanupNoFailure() int CleanupNoFailure()
@ -243,18 +234,15 @@ int CleanupNoFailure()
checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize)); checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize));
// Free host memory // Free host memory
if (h_A) if (h_A) {
{
free(h_A); free(h_A);
} }
if (h_B) if (h_B) {
{
free(h_B); free(h_B);
} }
if (h_C) if (h_C) {
{
free(h_C); free(h_C);
} }
@ -265,8 +253,7 @@ int CleanupNoFailure()
// Allocates an array with random float entries. // Allocates an array with random float entries.
void RandomInit(float *data, int n) void RandomInit(float *data, int n)
{ {
for (int i = 0; i < n; ++i) for (int i = 0; i < n; ++i) {
{
data[i] = rand() / (float)RAND_MAX; data[i] = rand() / (float)RAND_MAX;
} }
} }
@ -34,9 +34,10 @@
*/ */
// Device code // Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
float *C, int N) { {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) C[i] = A[i] + B[i]; if (i < N)
C[i] = A[i] + B[i];
} }
@@ -33,8 +33,8 @@
 * of the programming guide with some additions like error checking.
 */
#include <cmath>
#include <stdio.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda.h>
@@ -42,112 +42,116 @@
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <nvrtc_helper.h>
/**
 * Host main routine
 */
int main(int argc, char **argv)
{
    char *cubin, *kernel_file;
    size_t cubinSize;
    kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
    CUmodule module = loadCUBIN(cubin, argc, argv);
    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "vectorAdd"));
    // Print the vector length to be used, and compute its size
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);
    // Allocate the host input vector A
    float *h_A = reinterpret_cast<float *>(malloc(size));
    // Allocate the host input vector B
    float *h_B = reinterpret_cast<float *>(malloc(size));
    // Allocate the host output vector C
    float *h_C = reinterpret_cast<float *>(malloc(size));
    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }
    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand() / static_cast<float>(RAND_MAX);
        h_B[i] = rand() / static_cast<float>(RAND_MAX);
    }
    // Allocate the device input vector A
    CUdeviceptr d_A;
    checkCudaErrors(cuMemAlloc(&d_A, size));
    // Allocate the device input vector B
    CUdeviceptr d_B;
    checkCudaErrors(cuMemAlloc(&d_B, size));
    // Allocate the device output vector C
    CUdeviceptr d_C;
    checkCudaErrors(cuMemAlloc(&d_C, size));
    // Copy the host input vectors A and B in host memory to the device input
    // vectors in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size));
    checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size));
    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    dim3 cudaBlockSize(threadsPerBlock, 1, 1);
    dim3 cudaGridSize(blocksPerGrid, 1, 1);
    void *arr[] = {reinterpret_cast<void *>(&d_A),
                   reinterpret_cast<void *>(&d_B),
                   reinterpret_cast<void *>(&d_C),
                   reinterpret_cast<void *>(&numElements)};
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   cudaGridSize.x,
                                   cudaGridSize.y,
                                   cudaGridSize.z, /* grid dim */
                                   cudaBlockSize.x,
                                   cudaBlockSize.y,
                                   cudaBlockSize.z, /* block dim */
                                   0,
                                   0, /* shared mem, stream */
                                   &arr[0], /* arguments */
                                   0));
    checkCudaErrors(cuCtxSynchronize());
    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size));
    // Verify that the result vector is correct
    for (int i = 0; i < numElements; ++i) {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    printf("Test PASSED\n");
    // Free device global memory
    checkCudaErrors(cuMemFree(d_A));
    checkCudaErrors(cuMemFree(d_B));
    checkCudaErrors(cuMemFree(d_C));
    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);
    printf("Done\n");
    return 0;
}
@@ -32,11 +32,11 @@
 * number of elements numElements.
 */
extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}
File diff suppressed because it is too large.
@@ -32,12 +32,11 @@
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <iostream>
#include <memory>
#include <string>
int *pArgc = NULL;
char **pArgv = NULL;
#if CUDART_VERSION < 5000
@@ -46,19 +45,16 @@ char **pArgv = NULL;
#include <cuda.h>
// This function wraps the CUDA Driver API into a template function
template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
    if (CUDA_SUCCESS != error) {
        fprintf(
            stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", error, __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
}
#endif /* CUDART_VERSION < 5000 */
@@ -66,278 +62,259 @@ inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;
    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
    if (error_id != cudaSuccess) {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", static_cast<int>(error_id), cudaGetErrorString(error_id));
        printf("Result = FAIL\n");
        exit(EXIT_FAILURE);
    }
    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0) {
        printf("There are no available device(s) that support CUDA\n");
    }
    else {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }
    int dev, driverVersion = 0, runtimeVersion = 0;
    for (dev = 0; dev < deviceCount; ++dev) {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
               driverVersion / 1000,
               (driverVersion % 100) / 10,
               runtimeVersion / 1000,
               (runtimeVersion % 100) / 10);
        printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);
        char msg[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        sprintf_s(msg,
                  sizeof(msg),
                  " Total amount of global memory: %.0f MBytes "
                  "(%llu bytes)\n",
                  static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
                  (unsigned long long)deviceProp.totalGlobalMem);
#else
        snprintf(msg,
                 sizeof(msg),
                 " Total amount of global memory: %.0f MBytes "
                 "(%llu bytes)\n",
                 static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
                 (unsigned long long)deviceProp.totalGlobalMem);
#endif
        printf("%s", msg);
        printf(" (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf(" GPU Max Clock rate: %.0f MHz (%0.2f "
               "GHz)\n",
               deviceProp.clockRate * 1e-3f,
               deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
        printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
        if (deviceProp.l2CacheSize) {
            printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
        }
#else
        // This only available in CUDA 4.0-4.2 (but these were only exposed in the
        // CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf(" Memory Bus Width: %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
        if (L2CacheSize) {
            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
        }
#endif
        printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, "
               "%d), 3D=(%d, %d, %d)\n",
               deviceProp.maxTexture1D,
               deviceProp.maxTexture2D[0],
               deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0],
               deviceProp.maxTexture3D[1],
               deviceProp.maxTexture3D[2]);
        printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
               deviceProp.maxTexture1DLayered[0],
               deviceProp.maxTexture1DLayered[1]);
        printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
               "layers\n",
               deviceProp.maxTexture2DLayered[0],
               deviceProp.maxTexture2DLayered[1],
               deviceProp.maxTexture2DLayered[2]);
        printf(" Total amount of constant memory: %zu bytes\n", deviceProp.totalConstMem);
        printf(" Total amount of shared memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf(" Total shared memory per multiprocessor: %zu bytes\n", deviceProp.sharedMemPerMultiprocessor);
        printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf(" Warp size: %d\n", deviceProp.warpSize);
        printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf(" Maximum memory pitch: %zu bytes\n", deviceProp.memPitch);
        printf(" Texture alignment: %zu bytes\n", deviceProp.textureAlignment);
        printf(" Concurrent copy and kernel execution: %s with %d copy "
               "engine(s)\n",
               (deviceProp.deviceOverlap ? "Yes" : "No"),
               deviceProp.asyncEngineCount);
        printf(" Run time limit on kernels: %s\n",
               deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
        printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
        printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
               deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
        printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        printf(" Device supports Managed Memory: %s\n", deviceProp.managedMemory ? "Yes" : "No");
        printf(" Device supports Compute Preemption: %s\n",
               deviceProp.computePreemptionSupported ? "Yes" : "No");
        printf(" Supports Cooperative Kernel Launch: %s\n", deviceProp.cooperativeLaunch ? "Yes" : "No");
        printf(" Supports MultiDevice Co-op Kernel Launch: %s\n",
               deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
        printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n",
               deviceProp.pciDomainID,
               deviceProp.pciBusID,
               deviceProp.pciDeviceID);
        const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with device "
                                      "simultaneously)",
                                      "Exclusive (only one host thread in one process is able to use "
                                      "::cudaSetDevice() with this device)",
                                      "Prohibited (no host thread can use ::cudaSetDevice() with this "
                                      "device)",
                                      "Exclusive Process (many threads in one process is able to use "
                                      "::cudaSetDevice() with this device)",
                                      "Unknown",
                                      NULL};
        printf(" Compute Mode:\n");
        printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
    }
    // If there are 2 or more GPUs, query to determine whether RDMA is supported
    if (deviceCount >= 2) {
        cudaDeviceProp prop[64];
        int gpuid[64]; // we want to find the first two GPUs that can support P2P
        int gpu_p2p_count = 0;
        for (int i = 0; i < deviceCount; i++) {
            checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
            // Only boards based on Fermi or later can support P2P
            if ((prop[i].major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
                // on Windows (64-bit), the Tesla Compute Cluster driver for windows
                // must be enabled to support this
                && prop[i].tccDriver
#endif
            ) {
                // This is an array of P2P capable GPUs
                gpuid[gpu_p2p_count++] = i;
            }
        }
        // Show all the combinations of support P2P GPUs
        int can_access_peer;
        if (gpu_p2p_count >= 2) {
            for (int i = 0; i < gpu_p2p_count; i++) {
                for (int j = 0; j < gpu_p2p_count; j++) {
                    if (gpuid[i] == gpuid[j]) {
                        continue;
                    }
                    checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
                    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
                           prop[gpuid[i]].name,
                           gpuid[i],
                           prop[gpuid[j]].name,
                           gpuid[j],
                           can_access_peer ? "Yes" : "No");
                }
            }
        }
    }
    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[16];
    // driver version
    sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
#else
    snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
#endif
    sProfileString += cTemp;
    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
#else
    snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
#endif
    sProfileString += cTemp;
    // Device count
    sProfileString += ", NumDevs = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    snprintf(cTemp, sizeof(cTemp), "%d", deviceCount);
#endif
    sProfileString += cTemp;
    sProfileString += "\n";
    printf("%s", sProfileString.c_str());
    printf("Result = PASS\n");
    // finish
    exit(EXIT_SUCCESS);
}
@@ -30,358 +30,295 @@
 */
// includes, system
#include <cuda.h>
#include <helper_cuda_drvapi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    CUdevice dev;
    int major = 0, minor = 0;
    int deviceCount = 0;
    char deviceName[256];
    printf("%s Starting...\n\n", argv[0]);
    // note your project will need to link with cuda.lib files on windows
    printf("CUDA Device Query (Driver API) statically linked version \n");
    checkCudaErrors(cuInit(0));
    checkCudaErrors(cuDeviceGetCount(&deviceCount));
    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0) {
        printf("There are no available device(s) that support CUDA\n");
    }
    else {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }
    for (dev = 0; dev < deviceCount; ++dev) {
        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
        checkCudaErrors(cuDeviceGetName(deviceName, 256, dev));
        printf("\nDevice %d: \"%s\"\n", dev, deviceName);
        int driverVersion = 0;
        checkCudaErrors(cuDriverGetVersion(&driverVersion));
        printf(" CUDA Driver Version: %d.%d\n",
               driverVersion / 1000,
               (driverVersion % 100) / 10);
        printf(" CUDA Capability Major/Minor version number: %d.%d\n", major, minor);
        size_t totalGlobalMem;
        checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev));
        char msg[256];
        SPRINTF(msg,
                " Total amount of global memory: %.0f MBytes "
                "(%llu bytes)\n",
                (float)totalGlobalMem / 1048576.0f,
                (unsigned long long)totalGlobalMem);
        printf("%s", msg);
        int multiProcessorCount;
        getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
        printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
               multiProcessorCount,
               _ConvertSMVer2CoresDRV(major, minor),
               _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);
        int clockRate;
        getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
        printf(" GPU Max Clock rate: %.0f MHz (%0.2f "
               "GHz)\n",
               clockRate * 1e-3f,
               clockRate * 1e-6f);
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf(" Memory Bus Width: %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
        if (L2CacheSize) {
            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
        }
        int maxTex1D, maxTex2D[2], maxTex3D[3];
        getCudaAttribute<int>(&maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
        printf(" Max Texture Dimension Sizes 1D=(%d) 2D=(%d, %d) "
               "3D=(%d, %d, %d)\n",
               maxTex1D,
               maxTex2D[0],
               maxTex2D[1],
               maxTex3D[0],
               maxTex3D[1],
               maxTex3D[2]);
        int maxTex1DLayered[2];
        getCudaAttribute<int>(&maxTex1DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, dev);
        getCudaAttribute<int>(&maxTex1DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, dev);
        printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
               maxTex1DLayered[0],
               maxTex1DLayered[1]);
        int maxTex2DLayered[3];
        getCudaAttribute<int>(&maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev);
        printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
               "layers\n",
               maxTex2DLayered[0],
               maxTex2DLayered[1],
               maxTex2DLayered[2]);
        int totalConstantMemory;
        getCudaAttribute<int>(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
        printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory);
        int sharedMemPerBlock;
        getCudaAttribute<int>(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
        printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock);
        int regsPerBlock;
        getCudaAttribute<int>(&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
        printf(" Total number of registers available per block: %d\n", regsPerBlock);
        int warpSize;
        getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
        printf(" Warp size: %d\n", warpSize);
        int maxThreadsPerMultiProcessor;
        getCudaAttribute<int>(&maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
        printf(" Maximum number of threads per multiprocessor: %d\n", maxThreadsPerMultiProcessor);
        int maxThreadsPerBlock;
        getCudaAttribute<int>(&maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
        printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock);
        int blockDim[3];
        getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
        getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
        getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", blockDim[0], blockDim[1], blockDim[2]);
        int gridDim[3];
        getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
        getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
        getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", gridDim[0], gridDim[1], gridDim[2]);
        int textureAlign;
        getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev);
        printf(" Texture alignment: %u bytes\n", textureAlign);
        int memPitch;
        getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
        printf(" Maximum memory pitch: %u bytes\n", memPitch);
        int gpuOverlap;
        getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
        int asyncEngineCount;
        getCudaAttribute<int>(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
        printf(" Concurrent copy and kernel execution: %s with %d copy "
               "engine(s)\n",
               (gpuOverlap ? "Yes" : "No"),
               asyncEngineCount);
        int kernelExecTimeoutEnabled;
        getCudaAttribute<int>(&kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
        printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");
        int integrated;
        getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
        printf(" Integrated GPU sharing Host Memory: %s\n", integrated ? "Yes" : "No");
        int canMapHostMemory;
        getCudaAttribute<int>(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
        printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No");
        int concurrentKernels;
        getCudaAttribute<int>(&concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
        printf(" Concurrent kernel execution: %s\n", concurrentKernels ? "Yes" : "No");
        int surfaceAlignment;
        getCudaAttribute<int>(&surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
        printf(" Alignment requirement for Surfaces: %s\n", surfaceAlignment ? "Yes" : "No");
        int eccEnabled;
        getCudaAttribute<int>(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
        printf(" Device has ECC support: %s\n", eccEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        int tccDriver;
        getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);
        printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
               tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
        int unifiedAddressing;
        getCudaAttribute<int>(&unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
        printf(" Device supports Unified Addressing (UVA): %s\n", unifiedAddressing ? "Yes" : "No");
        int managedMemory;
        getCudaAttribute<int>(&managedMemory, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, dev);
        printf(" Device supports Managed Memory: %s\n", managedMemory ? "Yes" : "No");
        int computePreemption;
        getCudaAttribute<int>(&computePreemption, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, dev);
        printf(" Device supports Compute Preemption: %s\n", computePreemption ? "Yes" : "No");
        int cooperativeLaunch;
        getCudaAttribute<int>(&cooperativeLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
        printf(" Supports Cooperative Kernel Launch: %s\n", cooperativeLaunch ? "Yes" : "No");
        int cooperativeMultiDevLaunch;
        getCudaAttribute<int>(&cooperativeMultiDevLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, dev);
        printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", cooperativeMultiDevLaunch ? "Yes" : "No");
        int pciDomainID, pciBusID, pciDeviceID;
        getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);
        getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
        getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
        printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", pciDomainID, pciBusID, pciDeviceID);
        const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with device "
                                      "simultaneously)",
                                      "Exclusive (only one host thread in one process is able to use "
                                      "::cudaSetDevice() with this device)",
                                      "Prohibited (no host thread can use ::cudaSetDevice() with this "
                                      "device)",
                                      "Exclusive Process (many threads in one process is able to use "
                                      "::cudaSetDevice() with this device)",
                                      "Unknown",
                                      NULL};
        int computeMode;
        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
        printf(" Compute Mode:\n");
        printf(" < %s >\n", sComputeMode[computeMode]);
    }
    // If there are 2 or more GPUs, query to determine whether RDMA is supported
    if (deviceCount >= 2) {
        int gpuid[64]; // we want to find the first two GPUs that can support P2P
        int gpu_p2p_count = 0;
        int tccDriver = 0;
        for (int i = 0; i < deviceCount; i++) {
            checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i));
            checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i));
            getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);
            // Only boards based on Fermi or later can support P2P
            if ((major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
                // on Windows (64-bit), the Tesla Compute Cluster driver for windows
                // must be enabled to support this
                && tccDriver
#endif
            ) {
                // This is an array of P2P capable GPUs
                gpuid[gpu_p2p_count++] = i;
            }
        }
        // Show all the combinations of support P2P GPUs
        int can_access_peer;
        char deviceName0[256], deviceName1[256];
        if (gpu_p2p_count >= 2) {
            for (int i = 0; i < gpu_p2p_count; i++) {
                for (int j = 0; j < gpu_p2p_count; j++) {
                    if (gpuid[i] == gpuid[j]) {
                        continue;
                    }
                    checkCudaErrors(cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
                    checkCudaErrors(cuDeviceGetName(deviceName0, 256, gpuid[i]));
                    checkCudaErrors(cuDeviceGetName(deviceName1, 256, gpuid[j]));
                    printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : "
                           "%s\n",
                           deviceName0,
                           gpuid[i],
                           deviceName1,
                           gpuid[j],
                           can_access_peer ? "Yes" : "No");
                }
            }
        }
    }
    printf("Result = PASS\n");
    exit(EXIT_SUCCESS);
}
@@ -30,4 +30,3 @@ cudaGetDeviceCount, cudaDeviceGetAttribute
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details)
@@ -35,48 +35,44 @@
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples
int main(int argc, char **argv)
{
    int deviceCount = 0;
    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
    // Enumerates Device <-> Device links
    for (int device1 = 0; device1 < deviceCount; device1++) {
        for (int device2 = 0; device2 < deviceCount; device2++) {
            if (device1 == device2)
                continue;
            int perfRank = 0;
            int atomicSupported = 0;
            int accessSupported = 0;
            checkCudaErrors(
                cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2));
            checkCudaErrors(cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2));
            checkCudaErrors(
                cudaDeviceGetP2PAttribute(&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, device2));
            if (accessSupported) {
                std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" << std::endl;
                std::cout << " * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
                std::cout << " * Perf Rank: " << perfRank << std::endl;
            }
        }
    }
    // Enumerates Device <-> Host links
    for (int device = 0; device < deviceCount; device++) {
        int atomicSupported = 0;
        checkCudaErrors(cudaDeviceGetAttribute(&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device));
        std::cout << "GPU" << device << " <-> CPU:" << std::endl;
        std::cout << " * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
    }
    return 0;
}
@@ -37,4 +37,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details)
Some files were not shown because too many files have changed in this diff.