Add and Update samples for CUDA 10.0

2025-07-29 09:10:30 +08:00 · 2018-08-24 22:35:15 +05:30 · 2018-08-24 22:35:15 +05:30 · 21c36d3568
commit 21c36d3568
parent 63e044cd0f
178 changed files with 12375 additions and 1288 deletions
--- a/Common/helper_cuda.h
+++ b/Common/helper_cuda.h
@ -51,457 +51,17 @@
 // CUDA Runtime error messages
 #ifdef __DRIVER_TYPES_H__
 static const char *_cudaGetErrorEnum(cudaError_t error) {
-  switch (error) {
-    case cudaSuccess:
-      return "cudaSuccess";
-
-    case cudaErrorMissingConfiguration:
-      return "cudaErrorMissingConfiguration";
-
-    case cudaErrorMemoryAllocation:
-      return "cudaErrorMemoryAllocation";
-
-    case cudaErrorInitializationError:
-      return "cudaErrorInitializationError";
-
-    case cudaErrorLaunchFailure:
-      return "cudaErrorLaunchFailure";
-
-    case cudaErrorPriorLaunchFailure:
-      return "cudaErrorPriorLaunchFailure";
-
-    case cudaErrorLaunchTimeout:
-      return "cudaErrorLaunchTimeout";
-
-    case cudaErrorLaunchOutOfResources:
-      return "cudaErrorLaunchOutOfResources";
-
-    case cudaErrorInvalidDeviceFunction:
-      return "cudaErrorInvalidDeviceFunction";
-
-    case cudaErrorInvalidConfiguration:
-      return "cudaErrorInvalidConfiguration";
-
-    case cudaErrorInvalidDevice:
-      return "cudaErrorInvalidDevice";
-
-    case cudaErrorInvalidValue:
-      return "cudaErrorInvalidValue";
-
-    case cudaErrorInvalidPitchValue:
-      return "cudaErrorInvalidPitchValue";
-
-    case cudaErrorInvalidSymbol:
-      return "cudaErrorInvalidSymbol";
-
-    case cudaErrorMapBufferObjectFailed:
-      return "cudaErrorMapBufferObjectFailed";
-
-    case cudaErrorUnmapBufferObjectFailed:
-      return "cudaErrorUnmapBufferObjectFailed";
-
-    case cudaErrorInvalidHostPointer:
-      return "cudaErrorInvalidHostPointer";
-
-    case cudaErrorInvalidDevicePointer:
-      return "cudaErrorInvalidDevicePointer";
-
-    case cudaErrorInvalidTexture:
-      return "cudaErrorInvalidTexture";
-
-    case cudaErrorInvalidTextureBinding:
-      return "cudaErrorInvalidTextureBinding";
-
-    case cudaErrorInvalidChannelDescriptor:
-      return "cudaErrorInvalidChannelDescriptor";
-
-    case cudaErrorInvalidMemcpyDirection:
-      return "cudaErrorInvalidMemcpyDirection";
-
-    case cudaErrorAddressOfConstant:
-      return "cudaErrorAddressOfConstant";
-
-    case cudaErrorTextureFetchFailed:
-      return "cudaErrorTextureFetchFailed";
-
-    case cudaErrorTextureNotBound:
-      return "cudaErrorTextureNotBound";
-
-    case cudaErrorSynchronizationError:
-      return "cudaErrorSynchronizationError";
-
-    case cudaErrorInvalidFilterSetting:
-      return "cudaErrorInvalidFilterSetting";
-
-    case cudaErrorInvalidNormSetting:
-      return "cudaErrorInvalidNormSetting";
-
-    case cudaErrorMixedDeviceExecution:
-      return "cudaErrorMixedDeviceExecution";
-
-    case cudaErrorCudartUnloading:
-      return "cudaErrorCudartUnloading";
-
-    case cudaErrorUnknown:
-      return "cudaErrorUnknown";
-
-    case cudaErrorNotYetImplemented:
-      return "cudaErrorNotYetImplemented";
-
-    case cudaErrorMemoryValueTooLarge:
-      return "cudaErrorMemoryValueTooLarge";
-
-    case cudaErrorInvalidResourceHandle:
-      return "cudaErrorInvalidResourceHandle";
-
-    case cudaErrorNotReady:
-      return "cudaErrorNotReady";
-
-    case cudaErrorInsufficientDriver:
-      return "cudaErrorInsufficientDriver";
-
-    case cudaErrorSetOnActiveProcess:
-      return "cudaErrorSetOnActiveProcess";
-
-    case cudaErrorInvalidSurface:
-      return "cudaErrorInvalidSurface";
-
-    case cudaErrorNoDevice:
-      return "cudaErrorNoDevice";
-
-    case cudaErrorECCUncorrectable:
-      return "cudaErrorECCUncorrectable";
-
-    case cudaErrorSharedObjectSymbolNotFound:
-      return "cudaErrorSharedObjectSymbolNotFound";
-
-    case cudaErrorSharedObjectInitFailed:
-      return "cudaErrorSharedObjectInitFailed";
-
-    case cudaErrorUnsupportedLimit:
-      return "cudaErrorUnsupportedLimit";
-
-    case cudaErrorDuplicateVariableName:
-      return "cudaErrorDuplicateVariableName";
-
-    case cudaErrorDuplicateTextureName:
-      return "cudaErrorDuplicateTextureName";
-
-    case cudaErrorDuplicateSurfaceName:
-      return "cudaErrorDuplicateSurfaceName";
-
-    case cudaErrorDevicesUnavailable:
-      return "cudaErrorDevicesUnavailable";
-
-    case cudaErrorInvalidKernelImage:
-      return "cudaErrorInvalidKernelImage";
-
-    case cudaErrorNoKernelImageForDevice:
-      return "cudaErrorNoKernelImageForDevice";
-
-    case cudaErrorIncompatibleDriverContext:
-      return "cudaErrorIncompatibleDriverContext";
-
-    case cudaErrorPeerAccessAlreadyEnabled:
-      return "cudaErrorPeerAccessAlreadyEnabled";
-
-    case cudaErrorPeerAccessNotEnabled:
-      return "cudaErrorPeerAccessNotEnabled";
-
-    case cudaErrorDeviceAlreadyInUse:
-      return "cudaErrorDeviceAlreadyInUse";
-
-    case cudaErrorProfilerDisabled:
-      return "cudaErrorProfilerDisabled";
-
-    case cudaErrorProfilerNotInitialized:
-      return "cudaErrorProfilerNotInitialized";
-
-    case cudaErrorProfilerAlreadyStarted:
-      return "cudaErrorProfilerAlreadyStarted";
-
-    case cudaErrorProfilerAlreadyStopped:
-      return "cudaErrorProfilerAlreadyStopped";
-
-    /* Since CUDA 4.0*/
-    case cudaErrorAssert:
-      return "cudaErrorAssert";
-
-    case cudaErrorTooManyPeers:
-      return "cudaErrorTooManyPeers";
-
-    case cudaErrorHostMemoryAlreadyRegistered:
-      return "cudaErrorHostMemoryAlreadyRegistered";
-
-    case cudaErrorHostMemoryNotRegistered:
-      return "cudaErrorHostMemoryNotRegistered";
-
-    /* Since CUDA 5.0 */
-    case cudaErrorOperatingSystem:
-      return "cudaErrorOperatingSystem";
-
-    case cudaErrorPeerAccessUnsupported:
-      return "cudaErrorPeerAccessUnsupported";
-
-    case cudaErrorLaunchMaxDepthExceeded:
-      return "cudaErrorLaunchMaxDepthExceeded";
-
-    case cudaErrorLaunchFileScopedTex:
-      return "cudaErrorLaunchFileScopedTex";
-
-    case cudaErrorLaunchFileScopedSurf:
-      return "cudaErrorLaunchFileScopedSurf";
-
-    case cudaErrorSyncDepthExceeded:
-      return "cudaErrorSyncDepthExceeded";
-
-    case cudaErrorLaunchPendingCountExceeded:
-      return "cudaErrorLaunchPendingCountExceeded";
-
-    case cudaErrorNotPermitted:
-      return "cudaErrorNotPermitted";
-
-    case cudaErrorNotSupported:
-      return "cudaErrorNotSupported";
-
-    /* Since CUDA 6.0 */
-    case cudaErrorHardwareStackError:
-      return "cudaErrorHardwareStackError";
-
-    case cudaErrorIllegalInstruction:
-      return "cudaErrorIllegalInstruction";
-
-    case cudaErrorMisalignedAddress:
-      return "cudaErrorMisalignedAddress";
-
-    case cudaErrorInvalidAddressSpace:
-      return "cudaErrorInvalidAddressSpace";
-
-    case cudaErrorInvalidPc:
-      return "cudaErrorInvalidPc";
-
-    case cudaErrorIllegalAddress:
-      return "cudaErrorIllegalAddress";
-
-    /* Since CUDA 6.5*/
-    case cudaErrorInvalidPtx:
-      return "cudaErrorInvalidPtx";
-
-    case cudaErrorInvalidGraphicsContext:
-      return "cudaErrorInvalidGraphicsContext";
-
-    case cudaErrorStartupFailure:
-      return "cudaErrorStartupFailure";
-
-    case cudaErrorApiFailureBase:
-      return "cudaErrorApiFailureBase";
-
-    /* Since CUDA 8.0*/
-    case cudaErrorNvlinkUncorrectable:
-      return "cudaErrorNvlinkUncorrectable";
-
-    /* Since CUDA 8.5*/
-    case cudaErrorJitCompilerNotFound:
-      return "cudaErrorJitCompilerNotFound";
-
-    /* Since CUDA 9.0*/
-    case cudaErrorCooperativeLaunchTooLarge:
-      return "cudaErrorCooperativeLaunchTooLarge";
-  }
-
-  return "<unknown>";
+  return cudaGetErrorName(error);
 }
 #endif

-#ifdef __cuda_cuda_h__
+#ifdef CUDA_DRIVER_API
 // CUDA Driver API errors
 static const char *_cudaGetErrorEnum(CUresult error) {
-  switch (error) {
-    case CUDA_SUCCESS:
-      return "CUDA_SUCCESS";
-
-    case CUDA_ERROR_INVALID_VALUE:
-      return "CUDA_ERROR_INVALID_VALUE";
-
-    case CUDA_ERROR_OUT_OF_MEMORY:
-      return "CUDA_ERROR_OUT_OF_MEMORY";
-
-    case CUDA_ERROR_NOT_INITIALIZED:
-      return "CUDA_ERROR_NOT_INITIALIZED";
-
-    case CUDA_ERROR_DEINITIALIZED:
-      return "CUDA_ERROR_DEINITIALIZED";
-
-    case CUDA_ERROR_PROFILER_DISABLED:
-      return "CUDA_ERROR_PROFILER_DISABLED";
-
-    case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
-      return "CUDA_ERROR_PROFILER_NOT_INITIALIZED";
-
-    case CUDA_ERROR_PROFILER_ALREADY_STARTED:
-      return "CUDA_ERROR_PROFILER_ALREADY_STARTED";
-
-    case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
-      return "CUDA_ERROR_PROFILER_ALREADY_STOPPED";
-
-    case CUDA_ERROR_NO_DEVICE:
-      return "CUDA_ERROR_NO_DEVICE";
-
-    case CUDA_ERROR_INVALID_DEVICE:
-      return "CUDA_ERROR_INVALID_DEVICE";
-
-    case CUDA_ERROR_INVALID_IMAGE:
-      return "CUDA_ERROR_INVALID_IMAGE";
-
-    case CUDA_ERROR_INVALID_CONTEXT:
-      return "CUDA_ERROR_INVALID_CONTEXT";
-
-    case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
-      return "CUDA_ERROR_CONTEXT_ALREADY_CURRENT";
-
-    case CUDA_ERROR_MAP_FAILED:
-      return "CUDA_ERROR_MAP_FAILED";
-
-    case CUDA_ERROR_UNMAP_FAILED:
-      return "CUDA_ERROR_UNMAP_FAILED";
-
-    case CUDA_ERROR_ARRAY_IS_MAPPED:
-      return "CUDA_ERROR_ARRAY_IS_MAPPED";
-
-    case CUDA_ERROR_ALREADY_MAPPED:
-      return "CUDA_ERROR_ALREADY_MAPPED";
-
-    case CUDA_ERROR_NO_BINARY_FOR_GPU:
-      return "CUDA_ERROR_NO_BINARY_FOR_GPU";
-
-    case CUDA_ERROR_ALREADY_ACQUIRED:
-      return "CUDA_ERROR_ALREADY_ACQUIRED";
-
-    case CUDA_ERROR_NOT_MAPPED:
-      return "CUDA_ERROR_NOT_MAPPED";
-
-    case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
-      return "CUDA_ERROR_NOT_MAPPED_AS_ARRAY";
-
-    case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
-      return "CUDA_ERROR_NOT_MAPPED_AS_POINTER";
-
-    case CUDA_ERROR_ECC_UNCORRECTABLE:
-      return "CUDA_ERROR_ECC_UNCORRECTABLE";
-
-    case CUDA_ERROR_UNSUPPORTED_LIMIT:
-      return "CUDA_ERROR_UNSUPPORTED_LIMIT";
-
-    case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
-      return "CUDA_ERROR_CONTEXT_ALREADY_IN_USE";
-
-    case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
-      return "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED";
-
-    case CUDA_ERROR_INVALID_PTX:
-      return "CUDA_ERROR_INVALID_PTX";
-
-    case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
-      return "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT";
-
-    case CUDA_ERROR_NVLINK_UNCORRECTABLE:
-      return "CUDA_ERROR_NVLINK_UNCORRECTABLE";
-
-    case CUDA_ERROR_JIT_COMPILER_NOT_FOUND:
-      return "CUDA_ERROR_JIT_COMPILER_NOT_FOUND";
-
-    case CUDA_ERROR_INVALID_SOURCE:
-      return "CUDA_ERROR_INVALID_SOURCE";
-
-    case CUDA_ERROR_FILE_NOT_FOUND:
-      return "CUDA_ERROR_FILE_NOT_FOUND";
-
-    case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
-      return "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND";
-
-    case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
-      return "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED";
-
-    case CUDA_ERROR_OPERATING_SYSTEM:
-      return "CUDA_ERROR_OPERATING_SYSTEM";
-
-    case CUDA_ERROR_INVALID_HANDLE:
-      return "CUDA_ERROR_INVALID_HANDLE";
-
-    case CUDA_ERROR_NOT_FOUND:
-      return "CUDA_ERROR_NOT_FOUND";
-
-    case CUDA_ERROR_NOT_READY:
-      return "CUDA_ERROR_NOT_READY";
-
-    case CUDA_ERROR_ILLEGAL_ADDRESS:
-      return "CUDA_ERROR_ILLEGAL_ADDRESS";
-
-    case CUDA_ERROR_LAUNCH_FAILED:
-      return "CUDA_ERROR_LAUNCH_FAILED";
-
-    case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
-      return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
-
-    case CUDA_ERROR_LAUNCH_TIMEOUT:
-      return "CUDA_ERROR_LAUNCH_TIMEOUT";
-
-    case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
-      return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
-
-    case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
-      return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
-
-    case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
-      return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
-
-    case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
-      return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
-
-    case CUDA_ERROR_CONTEXT_IS_DESTROYED:
-      return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
-
-    case CUDA_ERROR_ASSERT:
-      return "CUDA_ERROR_ASSERT";
-
-    case CUDA_ERROR_TOO_MANY_PEERS:
-      return "CUDA_ERROR_TOO_MANY_PEERS";
-
-    case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
-      return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
-
-    case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
-      return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
-
-    case CUDA_ERROR_HARDWARE_STACK_ERROR:
-      return "CUDA_ERROR_HARDWARE_STACK_ERROR";
-
-    case CUDA_ERROR_ILLEGAL_INSTRUCTION:
-      return "CUDA_ERROR_ILLEGAL_INSTRUCTION";
-
-    case CUDA_ERROR_MISALIGNED_ADDRESS:
-      return "CUDA_ERROR_MISALIGNED_ADDRESS";
-
-    case CUDA_ERROR_INVALID_ADDRESS_SPACE:
-      return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
-
-    case CUDA_ERROR_INVALID_PC:
-      return "CUDA_ERROR_INVALID_PC";
-
-    case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE:
-      return "CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE";
-
-    case CUDA_ERROR_NOT_PERMITTED:
-      return "CUDA_ERROR_NOT_PERMITTED";
-
-    case CUDA_ERROR_NOT_SUPPORTED:
-      return "CUDA_ERROR_NOT_SUPPORTED";
-
-    case CUDA_ERROR_UNKNOWN:
-      return "CUDA_ERROR_UNKNOWN";
-  }
-
-  return "<unknown>";
+  static char unknown[] = "<unknown>";
+  const char *ret = NULL;
+  cuGetErrorName(error, &ret);
+  return ret ? ret : unknown;
 }
 #endif

@ -1067,18 +627,19 @@ inline int _ConvertSMVer2Cores(int major, int minor) {
  } sSMtoCores;

  sSMtoCores nGpuArchCoresPerSM[] = {
-      {0x30, 192},  // Kepler Generation (SM 3.0) GK10x class
-      {0x32, 192},  // Kepler Generation (SM 3.2) GK10x class
-      {0x35, 192},  // Kepler Generation (SM 3.5) GK11x class
-      {0x37, 192},  // Kepler Generation (SM 3.7) GK21x class
-      {0x50, 128},  // Maxwell Generation (SM 5.0) GM10x class
-      {0x52, 128},  // Maxwell Generation (SM 5.2) GM20x class
-      {0x53, 128},  // Maxwell Generation (SM 5.3) GM20x class
-      {0x60, 64},   // Pascal Generation (SM 6.0) GP100 class
-      {0x61, 128},  // Pascal Generation (SM 6.1) GP10x class
-      {0x62, 128},  // Pascal Generation (SM 6.2) GP10x class
-      {0x70, 64},   // Volta Generation (SM 7.0) GV100 class
-      {0x72, 64},   // Volta Generation (SM 7.2) GV11b class
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60,  64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70,  64},
+      {0x72,  64},
+      {0x75,  64},
      {-1, -1}};

  int index = 0;
@ -1155,7 +716,7 @@ inline int gpuDeviceInit(int devID) {
 inline int gpuGetMaxGflopsDeviceId() {
  int current_device = 0, sm_per_multiproc = 0;
  int max_perf_device = 0;
-  int device_count = 0, best_SM_arch = 0;
+  int device_count = 0;
  int devices_prohibited = 0;

  uint64_t max_compute_perf = 0;
@ -1169,30 +730,6 @@ inline int gpuGetMaxGflopsDeviceId() {
    exit(EXIT_FAILURE);
  }

-  // Find the best major SM Architecture GPU device
-  while (current_device < device_count) {
-    cudaGetDeviceProperties(&deviceProp, current_device);
-
-    // If this GPU is not running on Compute Mode prohibited,
-    // then we can add it to the list
-    if (deviceProp.computeMode != cudaComputeModeProhibited) {
-      if (deviceProp.major > 0 && deviceProp.major < 9999) {
-        best_SM_arch = MAX(best_SM_arch, deviceProp.major);
-      }
-    } else {
-      devices_prohibited++;
-    }
-
-    current_device++;
-  }
-
-  if (devices_prohibited == device_count) {
-    fprintf(stderr,
-            "gpuGetMaxGflopsDeviceId() CUDA error:"
-            " all devices have compute mode prohibited.\n");
-    exit(EXIT_FAILURE);
-  }
-
  // Find the best CUDA capable GPU device
  current_device = 0;

@ -1213,23 +750,23 @@ inline int gpuGetMaxGflopsDeviceId() {
                              sm_per_multiproc * deviceProp.clockRate;

      if (compute_perf > max_compute_perf) {
-        // If we find GPU with SM major > 2, search only these
-        if (best_SM_arch > 2) {
-          // If our device==dest_SM_arch, choose this, or else pass
-          if (deviceProp.major == best_SM_arch) {
-            max_compute_perf = compute_perf;
-            max_perf_device = current_device;
-          }
-        } else {
-          max_compute_perf = compute_perf;
-          max_perf_device = current_device;
-        }
+        max_compute_perf = compute_perf;
+        max_perf_device = current_device;
      }
+    } else {
+      devices_prohibited++;
    }

    ++current_device;
  }

+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " all devices have compute mode prohibited.\n");
+    exit(EXIT_FAILURE);
+  }
+
  return max_perf_device;
 }

--- a/Common/helper_cuda_drvapi.h
+++ b/Common/helper_cuda_drvapi.h
@ -122,18 +122,19 @@ inline int _ConvertSMVer2CoresDRV(int major, int minor) {
  } sSMtoCores;

  sSMtoCores nGpuArchCoresPerSM[] = {
-      {0x30, 192},  // Kepler Generation (SM 3.0) GK10x class
-      {0x32, 192},  // Kepler Generation (SM 3.2) GK10x class
-      {0x35, 192},  // Kepler Generation (SM 3.5) GK11x class
-      {0x37, 192},  // Kepler Generation (SM 3.7) GK21x class
-      {0x50, 128},  // Maxwell Generation (SM 5.0) GM10x class
-      {0x52, 128},  // Maxwell Generation (SM 5.2) GM20x class
-      {0x53, 128},  // Maxwell Generation (SM 5.3) GM20x class
-      {0x60, 64},   // Pascal Generation (SM 6.0) GP100 class
-      {0x61, 128},  // Pascal Generation (SM 6.1) GP10x class
-      {0x62, 128},  // Pascal Generation (SM 6.2) GP10x class
-      {0x70, 64},   // Volta Generation (SM 7.0) GV100 class
-      {0x72, 64},   // Volta Generation (SM 7.2) GV11b class
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60,  64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70,  64},
+      {0x72,  64},
+      {0x75,  64},
      {-1, -1}};

  int index = 0;
--- a/README.md
+++ b/README.md
@ -1,11 +1,22 @@
 # CUDA Samples

-Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads).
+Samples for CUDA Developers which demonstrate features in the CUDA Toolkit. This version supports [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads).

 ## Release Notes

 This section describes the release notes for the CUDA Samples on GitHub only.

+### CUDA 10.0
+*  Added `simpleCudaGraphs`. Demonstrates CUDA Graphs creation, instantiation and launch using Graphs APIs and Stream Capture APIs.
+*  Added `conjugateGradientCudaGraphs`. Demonstrates conjugate gradient solver on GPU using CUBLAS and CUSPARSE library calls captured and called using CUDA Graph APIs.
+*  Added `simpleVulkan`. Demonstrates Vulkan - CUDA Interop.
+*  Added `UnifiedMemoryPerf`. Demonstrates a performance comparison of the various memory types involved in a system.
+*  Added `p2pBandwidthLatencyTest`. Demonstrates Peer-To-Peer (P2P) data transfers between pairs of GPUs and computes latency and bandwidth.
+*  Added `systemWideAtomics`. Demonstrates system wide atomic instructions.
+*  Added `simpleCUBLASXT`. Demonstrates CUBLAS-XT library which performs GEMM operations over multiple GPUs.
+*  Added Windows OS support to `conjugateGradientMultiDeviceCG` sample.
+*  Removed support of Visual Studio 2010 from all samples.
+
 ### CUDA 9.2

 This is the first release of CUDA Samples on GitHub:
@ -26,7 +37,7 @@ This is the first release of CUDA Samples on GitHub:

 ### Prerequisites

-Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html), and the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html).

 ### Getting the CUDA Samples
@ -108,22 +119,27 @@ The samples makefiles can take advantage of certain options:
 ### Samples by OS

 #### Linux
-**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[deviceQuery](./Samples/deviceQuery)** |
+**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** |
 ---|---|---|---|
-**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** |
-**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[matrixMul](./Samples/matrixMul)** |
+**[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** |
+**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** |
+**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** |
+**[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** | **[systemWideAtomics](./Samples/systemWideAtomics)** |

 #### Windows
-**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[deviceQuery](./Samples/deviceQuery)** |
+**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[conjugateGradientMultiBlockCG](./Samples/conjugateGradientMultiBlockCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** |
 ---|---|---|---|
-**[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** |
-**[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** | **[matrixMul](./Samples/matrixMul)** |
+**[simpleCudaGraphs](./Samples/simpleCudaGraphs)** | **[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** |
+**[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** | **[conjugateGradientMultiDeviceCG](./Samples/conjugateGradientMultiDeviceCG)** | **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** |
+**[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[cudaTensorCoreGemm](./Samples/cudaTensorCoreGemm)** | **[simpleVulkan](./Samples/simpleVulkan)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** |
+**[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** |

 #### Mac OSX
-**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** |
+**[warpAggregatedAtomicsCG](./Samples/warpAggregatedAtomicsCG)** | **[shfl_scan](./Samples/shfl_scan)** | **[conjugateGradientCudaGraphs](./Samples/conjugateGradientCudaGraphs)** | **[simpleCudaGraphs](./Samples/simpleCudaGraphs)** |
 ---|---|---|---|
+**[deviceQuery](./Samples/deviceQuery)** | **[simpleVoteIntrinsics](./Samples/simpleVoteIntrinsics)** | **[simpleCUBLASXT](./Samples/simpleCUBLASXT)** | **[UnifiedMemoryPerf](./Samples/UnifiedMemoryPerf)** |
 **[matrixMulDrv](./Samples/matrixMulDrv)** | **[simpleCUFFT](./Samples/simpleCUFFT)** | **[simpleCUBLAS](./Samples/simpleCUBLAS)** | **[vectorAdd_nvrtc](./Samples/vectorAdd_nvrtc)** |
-**[matrixMul](./Samples/matrixMul)** |
+**[p2pBandwidthLatencyTest](./Samples/p2pBandwidthLatencyTest)** | **[matrixMul](./Samples/matrixMul)** |

 ## Dependencies

@ -161,6 +177,10 @@ OpenGL is a graphics library used for 2D and 3D rendering. On systems which supp

 OpenGL ES is an embedded systems graphics library used for 2D and 3D rendering. On systems which support OpenGL ES, NVIDIA's OpenGL ES implementation is provided with the CUDA Driver.

+#### Vulkan
+
+Vulkan is a low-overhead, cross-platform 3D graphics and compute API. Vulkan targets high-performance realtime 3D graphics applications such as video games and interactive media across all platforms. On systems which support Vulkan, NVIDIA's Vulkan implementation is provided with the CUDA Driver. For building and running Vulkan applications one needs to install the [Vulkan SDK](https://www.lunarg.com/vulkan-sdk/).
+
 #### OpenMP

 OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/).
--- a/Samples/UnifiedMemoryPerf/Makefile
+++ b/Samples/UnifiedMemoryPerf/Makefile
@ -0,0 +1,306 @@
+################################################################################
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project is only supported on Mac OS X and Linux platforms
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+endif
+
+ifeq ($(TARGET_OS),qnx)
+    CCFLAGS += -DWIN_INTERFACE_CUSTOM
+    LDFLAGS += -lsocket
+endif
+
+# Install directory of different arch
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+ifeq ($(dbg),1)
+      NVCCFLAGS += -g -G
+      BUILD_TYPE := debug
+else
+      BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+SAMPLE_ENABLED := 1
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES  := -I../../Common
+LIBRARIES :=
+
+################################################################################
+
+# Gencode arguments
+SMS ?= 30 35 37 50 52 60 61 70 75
+
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS)
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
+HIGHEST_SM := $(lastword $(sort $(SMS)))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+all: build
+
+build: UnifiedMemoryPerf
+
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+commonKernels.o:commonKernels.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+helperFunctions.o:helperFunctions.cpp
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+matrixMultiplyPerf.o:matrixMultiplyPerf.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+UnifiedMemoryPerf: commonKernels.o helperFunctions.o matrixMultiplyPerf.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./UnifiedMemoryPerf
+
+clean:
+	rm -f UnifiedMemoryPerf commonKernels.o helperFunctions.o matrixMultiplyPerf.o
+	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/UnifiedMemoryPerf
+
+clobber: clean
--- a/Samples/UnifiedMemoryPerf/NsightEclipse.xml
+++ b/Samples/UnifiedMemoryPerf/NsightEclipse.xml
@ -0,0 +1,84 @@
+<?xml version="1.0" encoding="UTF-8"?> 
+<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
+<entry>
+  <name>UnifiedMemoryPerf</name>
+  <cuda_api_list>
+    <toolkit>cudaMallocManaged</toolkit>
+    <toolkit>cudaStreamAttachMemAsync</toolkit>
+    <toolkit>cudaMemcpyAsync</toolkit>
+    <toolkit>cudaMallocHost</toolkit>
+    <toolkit>cudaMalloc</toolkit>
+  </cuda_api_list>
+  <description><![CDATA[This sample demonstrates the performance comparison using matrix multiplication kernel of Unified Memory with/without hints and other types of memory like zero copy buffers, pageable, pagelocked memory performing synchronous and Asynchronous transfers on a single GPU.]]></description>
+  <devicecompilation>whole</devicecompilation>
+  <includepaths>
+    <path>./</path>
+    <path>../</path>
+    <path>../../common/inc</path>
+  </includepaths>
+  <keyconcepts>
+    <concept level="basic">CUDA Systems Integration</concept>
+    <concept level="basic">Unified Memory</concept>
+    <concept level="basic">CUDA Streams and Events</concept>
+    <concept level="basic">Pinned System Paged Memory</concept>
+  </keyconcepts>
+  <keywords>
+    <keyword>CUDA</keyword>
+    <keyword>Unified Memory</keyword>
+    <keyword>Pinned Memory</keyword>
+    <keyword>Zero copy buffer</keyword>
+    <keyword>UVM</keyword>
+    <keyword>Streams</keyword>
+  </keywords>
+  <libraries>
+  </libraries>
+  <librarypaths>
+  </librarypaths>
+  <nsight_eclipse>true</nsight_eclipse>
+  <primary_file>matrixMultiplyPerf.cu</primary_file>
+  <required_dependencies>
+    <dependency>UVM</dependency>
+  </required_dependencies>
+  <scopes>
+    <scope>1:CUDA Basic Topics</scope>
+    <scope>1:CUDA Systems Integration</scope>
+    <scope>1:Unified Memory</scope>
+  </scopes>
+  <sm-arch>sm30</sm-arch>
+  <sm-arch>sm35</sm-arch>
+  <sm-arch>sm37</sm-arch>
+  <sm-arch>sm50</sm-arch>
+  <sm-arch>sm52</sm-arch>
+  <sm-arch>sm60</sm-arch>
+  <sm-arch>sm61</sm-arch>
+  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm75</sm-arch>
+  <supported_envs>
+    <env>
+      <arch>x86_64</arch>
+      <platform>linux</platform>
+    </env>
+    <env>
+      <arch>x86_64</arch>
+      <platform>macosx</platform>
+    </env>
+    <env>
+      <platform>windows7</platform>
+    </env>
+    <env>
+      <arch>arm</arch>
+    </env>
+    <env>
+      <arch>aarch64</arch>
+    </env>
+    <env>
+      <arch>ppc64le</arch>
+      <platform>linux</platform>
+    </env>
+  </supported_envs>
+  <supported_sm_architectures>
+    <from>3.0</from>
+  </supported_sm_architectures>
+  <title>Unified and other CUDA Memories Performance</title>
+  <type>exe</type>
+</entry>
--- a/Samples/UnifiedMemoryPerf/README.md
+++ b/Samples/UnifiedMemoryPerf/README.md
@ -0,0 +1,98 @@
+# UnifiedMemoryPerf - Unified and other CUDA Memories Performance
+
+## Description
+
+This sample demonstrates a performance comparison, using a matrix multiplication kernel, of Unified Memory with/without hints and other types of memory such as zero-copy buffers, pageable memory, and page-locked memory, performing synchronous and asynchronous transfers on a single GPU.
+
+## Key Concepts
+
+CUDA Systems Integration, Unified Memory, CUDA Streams and Events, Pinned System Paged Memory
+
+## Supported SM Architectures
+
+[SM 3.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows, MacOSX
+
+## Supported CPU Architecture
+
+x86_64, ppc64le, armv7l, aarch64
+
+## CUDA APIs involved
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+cudaMallocManaged, cudaStreamAttachMemAsync, cudaMemcpyAsync, cudaMallocHost, cudaMalloc
+
+## Dependencies needed to build/run
+[UVM](../../README.md#uvm)
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64.
+    By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/> `$ make TARGET_ARCH=aarch64` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```
+
+### Mac
+The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+
+The samples makefiles can take advantage of certain options:
+
+*  **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+
+*  **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60".
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers.
+    ```
+    $ make HOST_COMPILER=clang
+    ```
+
+## References (for more details)
+
--- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.sln
+++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryPerf", "UnifiedMemoryPerf_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.vcxproj
+++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2012.vcxproj
@ -0,0 +1,110 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>UnifiedMemoryPerf_vs2012</RootNamespace>
+    <ProjectName>UnifiedMemoryPerf</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v110</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/UnifiedMemoryPerf.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="commonKernels.cu" />
+    <ClCompile Include="helperFunctions.cpp" />
+    <CudaCompile Include="matrixMultiplyPerf.cu" />
+    <ClInclude Include="commonDefs.hpp" />
+    <ClInclude Include="commonKernels.hpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.sln
+++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 13.00
+# Visual Studio 2013
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryPerf", "UnifiedMemoryPerf_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.vcxproj
+++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2013.vcxproj
@ -0,0 +1,110 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>UnifiedMemoryPerf_vs2013</RootNamespace>
+    <ProjectName>UnifiedMemoryPerf</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/UnifiedMemoryPerf.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="commonKernels.cu" />
+    <ClCompile Include="helperFunctions.cpp" />
+    <CudaCompile Include="matrixMultiplyPerf.cu" />
+    <ClInclude Include="commonDefs.hpp" />
+    <ClInclude Include="commonKernels.hpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.sln
+++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 14.00
+# Visual Studio 2015
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryPerf", "UnifiedMemoryPerf_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.vcxproj
+++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2015.vcxproj
@ -0,0 +1,110 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>UnifiedMemoryPerf_vs2015</RootNamespace>
+    <ProjectName>UnifiedMemoryPerf</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v140</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/UnifiedMemoryPerf.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="commonKernels.cu" />
+    <ClCompile Include="helperFunctions.cpp" />
+    <CudaCompile Include="matrixMultiplyPerf.cu" />
+    <ClInclude Include="commonDefs.hpp" />
+    <ClInclude Include="commonKernels.hpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.sln
+++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2017
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryPerf", "UnifiedMemoryPerf_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj
+++ b/Samples/UnifiedMemoryPerf/UnifiedMemoryPerf_vs2017.vcxproj
@ -0,0 +1,111 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>UnifiedMemoryPerf_vs2017</RootNamespace>
+    <ProjectName>UnifiedMemoryPerf</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v141</PlatformToolset>
+	<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/UnifiedMemoryPerf.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="commonKernels.cu" />
+    <ClCompile Include="helperFunctions.cpp" />
+    <CudaCompile Include="matrixMultiplyPerf.cu" />
+    <ClInclude Include="commonDefs.hpp" />
+    <ClInclude Include="commonKernels.hpp" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/UnifiedMemoryPerf/commonDefs.hpp
+++ b/Samples/UnifiedMemoryPerf/commonDefs.hpp
@ -0,0 +1,88 @@
+/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _COMMON_DEFS_
+#define _COMMON_DEFS_
+#include <cuda.h>
+
+// Byte-count helpers: ONE_MB expands to 1024 * 1024.
+#define ONE_KB 1024
+#define ONE_MB (ONE_KB * ONE_KB)
+
+// Run-time settings shared between the sample's translation units. They are
+// only declared here; exactly one source file has to define them.
+extern size_t maxSampleSizeInMb;
+extern int numKernelRuns;
+extern int verboseResults;
+
+// Returns how many values of the sequence minSize, minSize * multiplier,
+// minSize * multiplier^2, ... are less than or equal to maxSize.
+extern unsigned int findNumSizesToTest(unsigned int minSize,
+                                       unsigned int maxSize,
+                                       unsigned int multiplier);
+
+// For tracking the different memory allocation types.
+// MEMALLOC_TYPE_START and MEMALLOC_TYPE_END alias the first and the last real
+// allocation type, so the whole range can be walked inclusively.
+// MEMALLOC_TYPE_INVALID is the value one past the last real type and doubles
+// as MEMALLOC_TYPE_COUNT, which is used to size per-type arrays.
+typedef enum memAllocType_enum {
+  MEMALLOC_TYPE_START,
+  USE_MANAGED_MEMORY_WITH_HINTS = MEMALLOC_TYPE_START,
+  USE_MANAGED_MEMORY_WITH_HINTS_ASYNC,
+  USE_MANAGED_MEMORY,
+  USE_ZERO_COPY,
+  USE_HOST_PAGEABLE_AND_DEVICE_MEMORY,
+  USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC,
+  USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY,
+  USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC,
+  MEMALLOC_TYPE_END = USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC,
+  MEMALLOC_TYPE_INVALID,
+  MEMALLOC_TYPE_COUNT = MEMALLOC_TYPE_INVALID
+} MemAllocType;
+
+// Distinguishes read bandwidth from write bandwidth.
+// NOTE(review): no user of this enum is visible in this header - confirm
+// where it is consumed.
+typedef enum bandwidthType_enum {
+  READ_BANDWIDTH,
+  WRITE_BANDWIDTH
+} BandwidthType;
+
+// Printable long and short names of the allocation types, indexed by
+// MemAllocType.
+extern const char *memAllocTypeStr[];
+extern const char *memAllocTypeShortStr[];
+
+// Opaque result containers; their layout is private to helperFunctions.cpp.
+struct resultsData;
+struct testResults;
+
+// Allocates a testResults object for `numSizesToTest` sizes with
+// `numMeasurements` measurements each and returns it through *results.
+void createAndInitTestResults(struct testResults **results,
+                              const char *testName,
+                              unsigned int numMeasurements,
+                              unsigned int numSizesToTest);
+// Returns the array of sizes owned by `results`; the array is allocated but
+// not initialised by createAndInitTestResults().
+unsigned long *getPtrSizesToTest(struct testResults *results);
+
+// Frees `results` together with every resultsData that was added to it.
+void freeTestResultsAndAllResultsData(struct testResults *results);
+
+// Allocates a resultsData series, appends it to the list kept by `results`
+// and returns it through *ptrData.
+void createResultDataAndAddToTestResults(struct resultsData **ptrData,
+                                         struct testResults *results,
+                                         const char *resultsName,
+                                         bool printOnlyInVerbose,
+                                         bool reportAsBandwidth);
+// Returns the array of individual run times (one entry per measurement) that
+// `data` keeps for the given allocation type and size index.
+double *getPtrRunTimesInMs(struct resultsData *data, int allocType,
+                           int sizeIndex);
+
+// Computes the statistics of every series attached to `results` and prints
+// them as tables on stdout.
+void printResults(struct testResults *results,
+                  bool print_launch_transfer_results, bool print_std_deviation);
+#endif
--- a/Samples/UnifiedMemoryPerf/commonKernels.cu
+++ b/Samples/UnifiedMemoryPerf/commonKernels.cu
@ -0,0 +1,33 @@
+/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "commonKernels.hpp"
+
+// Busy-wait kernel: every launched thread keeps re-reading latch[0] and only
+// returns once the value is at least 1. The kernel never writes the latch
+// itself, so the releasing store has to come from outside this kernel.
+// `volatile` forces a fresh load on every loop iteration.
+__global__ void spinWhileLessThanOne(volatile unsigned int *latch) {
+  while (*latch < 1) {
+    // spin until released
+  }
+}
--- a/Samples/UnifiedMemoryPerf/commonKernels.hpp
+++ b/Samples/UnifiedMemoryPerf/commonKernels.hpp
@ -0,0 +1,28 @@
+/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _COMMON_KERNELS_
+#define _COMMON_KERNELS_
+
+// Kernel in which every thread spins until latch[0] becomes >= 1; it is
+// defined in commonKernels.cu.
+__global__ void spinWhileLessThanOne(volatile unsigned int *latch);
+
+#endif  // _COMMON_KERNELS_
--- a/Samples/UnifiedMemoryPerf/helperFunctions.cpp
+++ b/Samples/UnifiedMemoryPerf/helperFunctions.cpp
@ -0,0 +1,303 @@
+/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "commonDefs.hpp"
+#define CU_INIT_UUID
+#include <cmath>
+
+#define UNITS_Time "ms"
+#define UNITS_BW "MB/s"
+#define KB_str "KB"
+#define MB_str "MB"
+
+// One named series of measurements; a node of the singly linked list that a
+// testResults object keeps.
+struct resultsData {
+  char resultsName[64];  // NUL-terminated label printed with the series
+  struct testResults *results;  // the test this series was added to
+  // Indexed as [allocType][sizeIndex][measurement]; in total this has
+  // MEMALLOC_TYPE_COUNT * results->numSizesToTest *
+  // results->numMeasurements elements
+  double **runTimesInMs[MEMALLOC_TYPE_COUNT];
+  // Per [allocType][sizeIndex] statistics; they are filled in by
+  // printResults(). Both standard deviations are stored as a percentage of
+  // the corresponding average.
+  double *averageRunTimesInMs[MEMALLOC_TYPE_COUNT];
+  double *stdDevRunTimesInMs[MEMALLOC_TYPE_COUNT];
+  double *stdDevBandwidthInMBps[MEMALLOC_TYPE_COUNT];
+  bool printOnlyInVerbose;  // skip the series unless verboseResults is set
+  bool reportAsBandwidth;   // print MB/s derived from the times, not ms
+  struct resultsData *next;  // next series, NULL at the tail of the list
+};
+
+// A complete test: its dimensions plus the list of result series that were
+// recorded for it.
+struct testResults {
+  char testName[64];             // NUL-terminated name printed in the report
+  unsigned int numMeasurements;  // measurements stored per size
+  unsigned long *sizesToTest;    // numSizesToTest entries, filled by caller
+  unsigned int numSizesToTest;
+  struct resultsData *resultsDataHead;  // first series, NULL when empty
+  struct resultsData *resultsDataTail;  // last series, where new ones append
+};
+
+// Counts the sizes in the geometric sequence minSize, minSize * multiplier,
+// minSize * multiplier^2, ... that do not exceed maxSize.
+//   minSize    - first size of the sequence
+//   maxSize    - inclusive upper bound
+//   multiplier - growth factor between consecutive sizes
+// Returns 0 when minSize > maxSize. When the sequence cannot grow
+// (minSize == 0 or multiplier <= 1) only the first size is counted instead
+// of looping forever.
+unsigned int findNumSizesToTest(unsigned int minSize, unsigned int maxSize,
+                                unsigned int multiplier) {
+  unsigned int numSizesToTest = 0;
+  if (minSize > maxSize) {
+    return 0;
+  }
+  if (minSize == 0 || multiplier <= 1) {
+    return 1;
+  }
+  for (;;) {
+    numSizesToTest++;
+    // Equivalent to "minSize * multiplier > maxSize", but written as a
+    // division so the product can never wrap around.
+    if (minSize > maxSize / multiplier) {
+      break;
+    }
+    minSize *= multiplier;
+  }
+  return numSizesToTest;
+}
+
+// qsort()-style three-way comparator for doubles: returns a negative value,
+// zero or a positive value when *ptr1 is less than, equal to or greater than
+// *ptr2.
+int compareDoubles(const void *ptr1, const void *ptr2) {
+  const double lhs = *(const double *)ptr1;
+  const double rhs = *(const double *)ptr2;
+  // Equal values have to compare as 0; reporting "less than" in both
+  // directions would make the ordering inconsistent.
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+// Returns runTimeInMs unchanged or, when getBandwidth is set, the throughput
+// in MB/s that corresponds to moving `size` bytes in runTimeInMs
+// milliseconds.
+static inline double getTimeOrBandwidth(double runTimeInMs, unsigned long size,
+                                        bool getBandwidth) {
+  if (!getBandwidth) {
+    return runTimeInMs;
+  }
+  const double bytesPerMs = size / runTimeInMs;
+  return (1000 * bytesPerMs) / ONE_MB;
+}
+
+// Allocates a zero-initialised testResults object, records its name and
+// dimensions and hands it back through *ptrResults.
+//   ptrResults      - receives the new object; release it with
+//                     freeTestResultsAndAllResultsData()
+//   testName        - label of the test; truncated if it does not fit into
+//                     testName[]
+//   numMeasurements - number of measurements kept per size
+//   numSizesToTest  - number of entries of the sizesToTest array, which is
+//                     left uninitialised for the caller to fill in
+// Terminates the process when memory cannot be allocated.
+void createAndInitTestResults(struct testResults **ptrResults,
+                              const char *testName,
+                              unsigned int numMeasurements,
+                              unsigned int numSizesToTest) {
+  struct testResults *results =
+      (struct testResults *)calloc(1, sizeof(struct testResults));
+  if (results == NULL) {
+    fprintf(stderr, "Failed to allocate test results for %s\n", testName);
+    exit(EXIT_FAILURE);
+  }
+  // The buffer is zeroed, so copying at most size - 1 characters always
+  // leaves a terminating NUL, even for an over-long testName.
+  strncpy(results->testName, testName, sizeof(results->testName) - 1);
+  results->numMeasurements = numMeasurements;
+  results->numSizesToTest = numSizesToTest;
+  results->sizesToTest =
+      (unsigned long *)malloc(numSizesToTest * sizeof(unsigned long));
+  // malloc(0) may legitimately return NULL, so only a non-empty request
+  // counts as a failure.
+  if (results->sizesToTest == NULL && numSizesToTest != 0) {
+    fprintf(stderr, "Failed to allocate %u sizes for %s\n", numSizesToTest,
+            testName);
+    free(results);
+    exit(EXIT_FAILURE);
+  }
+  results->resultsDataHead = NULL;
+  results->resultsDataTail = NULL;
+
+  *ptrResults = results;
+}
+
+// Gives the caller direct access to the sizesToTest array owned by
+// `results`.
+unsigned long *getPtrSizesToTest(struct testResults *results) {
+  unsigned long *sizes = results->sizesToTest;
+  return sizes;
+}
+
+// Allocates count * elemSize bytes and terminates the process when the
+// allocation fails. A NULL result for an empty request is not a failure.
+static void *allocResultsStorageOrExit(size_t count, size_t elemSize) {
+  void *storage = malloc(count * elemSize);
+  if (storage == NULL && count != 0) {
+    fprintf(stderr, "Failed to allocate %lu bytes for results data\n",
+            (unsigned long)(count * elemSize));
+    exit(EXIT_FAILURE);
+  }
+  return storage;
+}
+
+// Creates a new result series, sizes its storage from the dimensions of
+// `results` and appends it to the list kept by `results`.
+//   ptrData            - receives the new series (owned by `results`)
+//   results            - test the series belongs to
+//   resultsName        - label of the series; truncated if it does not fit
+//                        into resultsName[]
+//   printOnlyInVerbose - stored as-is in the series
+//   reportAsBandwidth  - stored as-is in the series
+// Terminates the process when memory cannot be allocated.
+void createResultDataAndAddToTestResults(struct resultsData **ptrData,
+                                         struct testResults *results,
+                                         const char *resultsName,
+                                         bool printOnlyInVerbose,
+                                         bool reportAsBandwidth) {
+  unsigned int i, j;
+  const size_t numSizes = results->numSizesToTest;
+  struct resultsData *data = (struct resultsData *)allocResultsStorageOrExit(
+      1, sizeof(struct resultsData));
+  memset(data, 0, sizeof(struct resultsData));
+  // The struct was just zeroed, so copying at most size - 1 characters
+  // always leaves a terminating NUL.
+  strncpy(data->resultsName, resultsName, sizeof(data->resultsName) - 1);
+  data->results = results;
+  for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) {
+    data->runTimesInMs[i] =
+        (double **)allocResultsStorageOrExit(numSizes, sizeof(double *));
+    for (j = 0; j < results->numSizesToTest; j++) {
+      data->runTimesInMs[i][j] = (double *)allocResultsStorageOrExit(
+          results->numMeasurements, sizeof(double));
+    }
+    data->averageRunTimesInMs[i] =
+        (double *)allocResultsStorageOrExit(numSizes, sizeof(double));
+    data->stdDevRunTimesInMs[i] =
+        (double *)allocResultsStorageOrExit(numSizes, sizeof(double));
+    data->stdDevBandwidthInMBps[i] =
+        (double *)allocResultsStorageOrExit(numSizes, sizeof(double));
+  }
+  data->printOnlyInVerbose = printOnlyInVerbose;
+  data->reportAsBandwidth = reportAsBandwidth;
+  data->next = NULL;
+  *ptrData = data;
+  // Append at the tail so the series are later printed in creation order.
+  if (results->resultsDataHead == NULL) {
+    results->resultsDataHead = data;
+    results->resultsDataTail = data;
+  } else {
+    results->resultsDataTail->next = data;
+    results->resultsDataTail = data;
+  }
+}
+
+// Returns the array of individual run times that `data` stores for the
+// given allocation type and size index. Neither index is range-checked.
+double *getPtrRunTimesInMs(struct resultsData *data, int allocType,
+                           int sizeIndex) {
+  double **perSizeRunTimes = data->runTimesInMs[allocType];
+  return perSizeRunTimes[sizeIndex];
+}
+
+// Releases `results`, its sizesToTest array and every result series in its
+// list together with all per-type / per-size buffers of each series.
+// Passing NULL is a no-op, mirroring free().
+void freeTestResultsAndAllResultsData(struct testResults *results) {
+  struct resultsData *data, *dataToFree;
+  unsigned int i, j;
+  if (results == NULL) {
+    return;
+  }
+  data = results->resultsDataHead;
+  while (data != NULL) {
+    for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) {
+      for (j = 0; j < results->numSizesToTest; j++) {
+        free(data->runTimesInMs[i][j]);
+      }
+      free(data->runTimesInMs[i]);
+      free(data->averageRunTimesInMs[i]);
+      free(data->stdDevRunTimesInMs[i]);
+      free(data->stdDevBandwidthInMBps[i]);
+    }
+    // Read the successor before the node itself is released.
+    dataToFree = data;
+    data = data->next;
+    free(dataToFree);
+  }
+  free(results->sizesToTest);
+  free(results);
+}
+
+// Computes the arithmetic mean of allResults[0..count) and its population
+// standard deviation (divisor is count), the latter expressed as a
+// percentage of the mean.
+//   pAverage   - receives the mean
+//   pStdDev    - receives 100 * stdDev / mean, or 0 when the mean is 0
+//   allResults - the samples
+//   count      - number of samples; with 0 samples both outputs are 0
+void calculateAverageAndStdDev(double *pAverage, double *pStdDev,
+                               double *allResults, unsigned int count) {
+  unsigned int i;
+  double average = 0.0;
+  double stdDev = 0.0;
+  // Without samples the divisions below would be 0 / 0 and yield NaN.
+  if (count == 0) {
+    *pAverage = 0.0;
+    *pStdDev = 0.0;
+    return;
+  }
+  for (i = 0; i < count; i++) {
+    average += allResults[i];
+  }
+  average /= count;
+  for (i = 0; i < count; i++) {
+    stdDev += (allResults[i] - average) * (allResults[i] - average);
+  }
+  stdDev /= count;
+  stdDev = sqrt(stdDev);
+  *pAverage = average;
+  *pStdDev = (average == 0.0) ? 0.0 : ((100.0 * stdDev) / average);
+}
+
+// Converts every run time in allResults[0..count) into a bandwidth in MB/s
+// for a transfer of `size` bytes and computes the population standard
+// deviation of those bandwidths as a percentage of their mean.
+//   pStdDev    - receives 100 * stdDev / mean, or 0 when the mean is 0
+//   allResults - run times in milliseconds
+//   count      - number of run times; with 0 run times the output is 0
+//   size       - number of bytes moved in each run
+void calculateStdDevBandwidth(double *pStdDev, double *allResults,
+                              unsigned int count, unsigned long size) {
+  unsigned int i;
+  double bandwidth;
+  double average = 0.0;
+  double stdDev = 0.0;
+  // Without samples the divisions below would be 0 / 0 and yield NaN.
+  if (count == 0) {
+    *pStdDev = 0.0;
+    return;
+  }
+  for (i = 0; i < count; i++) {
+    bandwidth = getTimeOrBandwidth(allResults[i], size, true);
+    average += bandwidth;
+  }
+  average /= count;
+  for (i = 0; i < count; i++) {
+    bandwidth = getTimeOrBandwidth(allResults[i], size, true);
+    stdDev += (bandwidth - average) * (bandwidth - average);
+  }
+  stdDev /= count;
+  stdDev = sqrt(stdDev);
+  *pStdDev = (average == 0.0) ? 0.0 : ((100.0 * stdDev) / average);
+}
+
+// Prints one table for `data`: a header row with the short allocation-type
+// names, then one row per tested size (shown in KB) with one cell per
+// allocation type. A cell holds
+//   - the stored bandwidth standard deviation when printStdDev is set and
+//     the series is reported as bandwidth, otherwise
+//   - the average (printAverage set) or the run-time standard deviation,
+//     passed through getTimeOrBandwidth().
+void printTimesInTableFormat(struct testResults *results,
+                             struct resultsData *data, bool printAverage,
+                             bool printStdDev) {
+  const bool showBandwidthStdDev = printStdDev && data->reportAsBandwidth;
+  unsigned int typeIdx, sizeIdx;
+
+  printf("Size_KB");
+  for (typeIdx = 0; typeIdx < MEMALLOC_TYPE_COUNT; typeIdx++) {
+    printf("\t%7s", memAllocTypeShortStr[typeIdx]);
+  }
+  printf("\n");
+
+  for (sizeIdx = 0; sizeIdx < results->numSizesToTest; sizeIdx++) {
+    printf("%lu", results->sizesToTest[sizeIdx] / ONE_KB);
+    for (typeIdx = 0; typeIdx < MEMALLOC_TYPE_COUNT; typeIdx++) {
+      double cell;
+      if (showBandwidthStdDev) {
+        cell = data->stdDevBandwidthInMBps[typeIdx][sizeIdx];
+      } else {
+        const double *series = printAverage
+                                   ? data->averageRunTimesInMs[typeIdx]
+                                   : data->stdDevRunTimesInMs[typeIdx];
+        cell = getTimeOrBandwidth(series[sizeIdx],
+                                  results->sizesToTest[sizeIdx],
+                                  data->reportAsBandwidth);
+      }
+      if (data->reportAsBandwidth) {
+        printf("\t%7.2lf", cell);
+      } else {
+        printf("\t%7.3lf", cell);
+      }
+    }
+    printf("\n");
+  }
+}
+
+// Dumps every individual measurement of `data`: for each allocation type a
+// header row with the tested sizes (in KB), then one row per measurement
+// with one cell per size, printed as time or as bandwidth depending on
+// data->reportAsBandwidth.
+void printAllResultsInVerboseMode(struct testResults *results,
+                                  struct resultsData *data) {
+  unsigned int typeIdx, sizeIdx, runIdx;
+  for (typeIdx = 0; typeIdx < MEMALLOC_TYPE_COUNT; typeIdx++) {
+    printf("Verbose mode, printing all results for %s\n",
+           memAllocTypeStr[typeIdx]);
+    printf("Instance");
+    for (sizeIdx = 0; sizeIdx < results->numSizesToTest; sizeIdx++) {
+      printf("\t%lu", results->sizesToTest[sizeIdx] / ONE_KB);
+    }
+    printf("\n");
+    for (runIdx = 0; runIdx < results->numMeasurements; runIdx++) {
+      printf("%u", runIdx);
+      for (sizeIdx = 0; sizeIdx < results->numSizesToTest; sizeIdx++) {
+        const double cell = getTimeOrBandwidth(
+            data->runTimesInMs[typeIdx][sizeIdx][runIdx],
+            results->sizesToTest[sizeIdx], data->reportAsBandwidth);
+        if (data->reportAsBandwidth) {
+          printf("\t%7.2lf", cell);
+        } else {
+          printf("\t%7.3lf", cell);
+        }
+      }
+      printf("\n");
+    }
+  }
+}
+
+// Prints the result series attached to `results`.
+// For every series that is not filtered out, the average and the standard
+// deviation of each (allocation type, size) cell are computed first, then
+// the table of averages is printed, optionally followed by the table of
+// standard deviations and, in verbose mode, by all raw measurements.
+//   results                       - the test whose series are printed
+//   print_launch_transfer_results - when false, only the series named
+//                                   "Overall Time" is printed
+//   print_std_deviation           - also print the standard-deviation table
+// Series flagged printOnlyInVerbose are skipped unless verboseResults is
+// set.
+void printResults(struct testResults *results,
+                  bool print_launch_transfer_results,
+                  bool print_std_deviation) {
+  unsigned int i, j;
+  struct resultsData *resultsIter;
+  for (resultsIter = results->resultsDataHead; resultsIter != NULL;
+       resultsIter = resultsIter->next) {
+    if (!verboseResults && resultsIter->printOnlyInVerbose) {
+      continue;
+    }
+    if (!print_launch_transfer_results &&
+        strcmp(resultsIter->resultsName, "Overall Time") != 0) {
+      continue;
+    }
+    // regular print
+    printf("\n%s For %s ", resultsIter->resultsName, results->testName);
+    printf("\n");
+    // Fill in the per-cell statistics that the tables below read.
+    for (j = 0; j < results->numSizesToTest; j++) {
+      for (i = 0; i < MEMALLOC_TYPE_COUNT; i++) {
+        calculateAverageAndStdDev(&resultsIter->averageRunTimesInMs[i][j],
+                                  &resultsIter->stdDevRunTimesInMs[i][j],
+                                  resultsIter->runTimesInMs[i][j],
+                                  results->numMeasurements);
+        if (resultsIter->reportAsBandwidth) {
+          calculateStdDevBandwidth(&resultsIter->stdDevBandwidthInMBps[i][j],
+                                   resultsIter->runTimesInMs[i][j],
+                                   results->numMeasurements,
+                                   results->sizesToTest[j]);
+        }
+      }
+    }
+    printf("\nPrinting Average of %u measurements in (%s)\n",
+           results->numMeasurements,
+           resultsIter->reportAsBandwidth ? UNITS_BW : UNITS_Time);
+    printTimesInTableFormat(results, resultsIter, true, false);
+    if (print_std_deviation) {
+      printf(
+          "\nPrinting Standard Deviation as %% of average of %u measurements\n",
+          results->numMeasurements);
+      printTimesInTableFormat(results, resultsIter, false, true);
+    }
+    if (verboseResults) {
+      printAllResultsInVerboseMode(results, resultsIter);
+    }
+  }
+}
--- a/Samples/UnifiedMemoryPerf/matrixMultiplyPerf.cu
+++ b/Samples/UnifiedMemoryPerf/matrixMultiplyPerf.cu
@ -0,0 +1,697 @@
+/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <helper_cuda.h>
+#include <helper_timer.h>
+#include "commonDefs.hpp"
+#include "commonKernels.hpp"
+
+// Set to 1 to compile in verifyMatrixMultiplyCorrectness() and have
+// runMatrixMultiplyKernel() check the GPU reference products against a CPU
+// computation; with 0 that check is compiled out.
+#define VERIFY_GPU_CORRECTNESS 0
+
+// Upper bound, in MB, from which matrixMultiplyPerfRunner() derives the
+// largest matrix size that is tested.
+size_t maxSampleSizeInMb = 64;
+// Number of timed iterations per configuration; main() overrides it when
+// -kernel-iterations is passed on the command line.
+int numKernelRuns = 100;
+// Non-zero enables verbose output; main() sets it to 1 when -verbose is
+// passed on the command line.
+int verboseResults = 0;
+
+// Long display name of each memory allocation strategy, one entry per
+// strategy (MEMALLOC_TYPE_COUNT entries in total).
+// NOTE(review): presumably indexed by the allocation-type values declared in
+// commonDefs.hpp, so the order here would have to match that declaration --
+// confirm against the header.
+const char *memAllocTypeStr[MEMALLOC_TYPE_COUNT] = {
+    "Managed_Memory_With_Hints",
+    "Managed_Memory_With_Hints_FullyAsync",
+    "Managed_Memory_NoHints",
+    "Zero_Copy",
+    "Memcpy_HostMalloc_DeviceCudaMalloc",
+    "MemcpyAsync_HostMalloc_DeviceCudaMalloc",
+    "Memcpy_HostCudaHostAlloc_DeviceCudaMalloc",
+    "MemcpyAsync_HostCudaHostAlloc_DeviceCudaMalloc"};
+
+// Abbreviated name of each memory allocation strategy (at most 7 characters
+// each), listed in the same order as memAllocTypeStr.
+// NOTE(review): presumably used as compact column headers in the result
+// tables and indexed by the allocation-type values -- confirm against the
+// printing code and commonDefs.hpp.
+const char *memAllocTypeShortStr[MEMALLOC_TYPE_COUNT] = {
+    "UMhint",   // Managed memory with hints
+    "UMhntAs",  // Managed memory with hints, async
+    "UMeasy",   // Managed memory with no hints
+    "0Copy",    // Zero copy
+    "MemCopy",  // Host pageable and device memory
+    "CpAsync",  // Host pageable and device memory, async
+    "CpHpglk",  // Host page-locked and device memory
+    "CpPglAs"   // Host page-locked and device memory, async
+};
+
+// Returns a pseudo-random float obtained by linearly interpolating between
+// low and high with a factor taken from rand() scaled by RAND_MAX.
+static float RandFloat(float low, float high) {
+  const float fraction =
+      static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+  return (1.0f - fraction) * low + fraction * high;
+}
+
+// Assigns a RandFloat(0.0f, 10.0f) value to every element of a
+// matrixDim x matrixDim matrix held in a flat array, visiting the elements in
+// increasing index order (index = col + row * matrixDim).
+void fillMatrixWithRandomValues(float *matrix, unsigned int matrixDim) {
+  for (unsigned int row = 0; row < matrixDim; ++row) {
+    const unsigned int rowStart = row * matrixDim;
+    unsigned int col = 0;
+    while (col < matrixDim) {
+      matrix[col + rowStart] = RandFloat(0.0f, 10.0f);
+      ++col;
+    }
+  }
+}
+
+#if VERIFY_GPU_CORRECTNESS
+// Checks a computed product C against a CPU reference of A * B.
+//
+// C, A and B are matrixDim x matrixDim matrices held in flat arrays indexed
+// as [col + row * matrixDim]. For every element the CPU recomputes the dot
+// product in single precision and compares it with C using an absolute
+// tolerance of 0.001 * matrixDim. Every mismatch is printed; if at least one
+// mismatch was found the process terminates with EXIT_FAILURE.
+void verifyMatrixMultiplyCorrectness(float *C, float *A, float *B,
+                                     unsigned int matrixDim) {
+  unsigned int i, j, k, numErrors = 0;
+  for (i = 0; i < matrixDim; ++i) {
+    for (j = 0; j < matrixDim; ++j) {
+      float result = 0.0f;
+      for (k = 0; k < matrixDim; ++k) {
+        result += A[k + i * matrixDim] * B[j + k * matrixDim];
+      }
+      if (fabs(C[j + i * matrixDim] - result) > 0.001 * matrixDim) {
+        printf("At [%u, %u]: Expected %f, Found %f\n", i, j, result,
+               C[j + i * matrixDim]);
+        ++numErrors;
+      }
+    }
+  }
+  if (numErrors != 0) {
+    // numErrors is unsigned, so it is printed with %u rather than %d.
+    printf("%u value mismatches occured\n", numErrors);
+    fflush(stdout);
+    exit(EXIT_FAILURE);  // the GPU result cannot be trusted
+  }
+}
+#endif
+
+// Copies a matrixDim x matrixDim float matrix from srcMatrix to dstMatrix.
+// Both buffers must hold at least matrixDim * matrixDim floats and must not
+// overlap, because the copy is done with memcpy.
+void copyMatrix(float *dstMatrix, float *srcMatrix, unsigned int matrixDim) {
+  // Widen before multiplying so the byte count is computed in size_t instead
+  // of wrapping in 32-bit unsigned arithmetic for large dimensions.
+  size_t size = (size_t)matrixDim * matrixDim * sizeof(float);
+  memcpy(dstMatrix, srcMatrix, size);
+}
+
+// Compares two matrixDim x matrixDim matrices element by element using exact
+// floating-point equality.
+//
+// Each differing element is counted and, when verboseResults is non-zero,
+// printed with its position. If any element differs, the mismatch count is
+// printed and the process terminates with EXIT_FAILURE.
+void verifyMatrixData(float *expectedData, float *observedData,
+                      unsigned int matrixDim) {
+  unsigned int i, j, numErrors = 0;
+  for (i = 0; i < matrixDim; ++i) {
+    for (j = 0; j < matrixDim; ++j) {
+      const unsigned int idx = j + i * matrixDim;
+      if (expectedData[idx] != observedData[idx]) {
+        ++numErrors;
+        if (verboseResults) {
+          printf("At [%u, %u]: Expected %f, Found %f\n", i, j,
+                 expectedData[idx], observedData[idx]);
+        }
+      }
+    }
+  }
+  if (numErrors != 0) {
+    // numErrors is unsigned, so it is printed with %u rather than %d.
+    printf("%u value mismatches occured\n", numErrors);
+    fflush(stdout);
+    exit(EXIT_FAILURE);  // the observed data does not match the reference
+  }
+}
+
+// Edge length of the square tile that one thread block processes.
+#define BLOCK_SIZE 32
+
+// Tiled matrix multiplication: C = A * B for square matrixDim x matrixDim
+// matrices held in flat arrays indexed as [col + row * matrixDim].
+//
+// Every thread block produces one BLOCK_SIZE x BLOCK_SIZE tile of C and every
+// thread produces one element of that tile. The tiles are indexed with
+// threadIdx.x / threadIdx.y and there are no bounds checks, so the kernel
+// expects BLOCK_SIZE x BLOCK_SIZE thread blocks and a matrixDim that is a
+// multiple of BLOCK_SIZE. It uses two statically sized shared-memory tiles of
+// BLOCK_SIZE x BLOCK_SIZE floats each.
+__global__ void matrixMultiplyKernel(float *C, float *A, float *B,
+                                     unsigned int matrixDim) {
+  // Shared-memory staging areas for the current tile of A and of B.
+  __shared__ float tileA[BLOCK_SIZE][BLOCK_SIZE];
+  __shared__ float tileB[BLOCK_SIZE][BLOCK_SIZE];
+
+  // Position of this block in the grid and of this thread in the block.
+  const int blockCol = blockIdx.x;
+  const int blockRow = blockIdx.y;
+  const int col = threadIdx.x;
+  const int row = threadIdx.y;
+
+  const unsigned int widthA = matrixDim;
+  const unsigned int widthB = matrixDim;
+
+  // First and last index of the band of A tiles walked by this block, and
+  // the distance between consecutive tiles of A.
+  const int firstA = matrixDim * BLOCK_SIZE * blockRow;
+  const int lastA = firstA + widthA - 1;
+  const int strideA = BLOCK_SIZE;
+
+  // First index of the band of B tiles walked by this block, and the distance
+  // between consecutive tiles of B.
+  const int firstB = BLOCK_SIZE * blockCol;
+  const int strideB = BLOCK_SIZE * widthB;
+
+  // Running value of the single output element owned by this thread.
+  float acc = 0;
+
+  int a = firstA;
+  int b = firstB;
+  while (a <= lastA) {
+    // Each thread stages one element of each tile.
+    tileA[row][col] = A[a + widthA * row + col];
+    tileB[row][col] = B[b + widthB * row + col];
+
+    // Both tiles must be completely loaded before any thread reads them.
+    __syncthreads();
+
+    // Accumulate this tile pair's contribution to the output element.
+#pragma unroll
+    for (int k = 0; k < BLOCK_SIZE; ++k) {
+      acc += tileA[row][k] * tileB[k][col];
+    }
+
+    // Every thread must be done with the tiles before they are overwritten
+    // in the next pass.
+    __syncthreads();
+
+    a += strideA;
+    b += strideB;
+  }
+
+  // Store the finished element into this block's tile of C.
+  const int outBase = widthB * BLOCK_SIZE * blockRow + BLOCK_SIZE * blockCol;
+  C[outBase + widthB * row + col] = acc;
+}
+
+// Benchmarks matrixMultiplyKernel for one matrix size and one memory
+// allocation strategy, recording per-iteration timings.
+//
+// The function first computes two reference products (X*Y and Y*X) on the GPU
+// using plain device memory. It then allocates the A/B/C buffers according to
+// allocType and, for each of numLoops iterations, fills A and B on the CPU,
+// moves the data to the GPU (explicit copies or managed-memory hints,
+// depending on allocType), launches the kernel, moves the result back and
+// compares it with the matching reference product. Even iterations compute
+// X*Y and odd iterations compute Y*X.
+//
+// Parameters:
+//   matrixDim  - dimension of the square matrices. The launch uses 32x32
+//                thread blocks and matrixDim / 32 blocks per grid axis, and
+//                the kernel has no bounds checks, so a multiple of 32 is
+//                expected.
+//   allocType  - one of the USE_* allocation strategies handled by the switch
+//                statements below; any other value terminates the process.
+//   numLoops   - number of timed iterations; every output array must have
+//                room for numLoops entries.
+//   gpuLaunchCallsTimes, gpuTransferToCallsTimes, gpuTransferFromCallsTimes,
+//   gpuLaunchAndTransferCallsTimes, gpuLaunchTransferSyncTimes,
+//   cpuAccessTimes, overallTimes
+//              - output arrays receiving, per iteration, the values reported
+//                by sdkGetAverageTimerValue() for the respective phase and
+//                the sums derived from them (presumably milliseconds --
+//                confirm against helper_timer.h).
+//   device_id  - device whose properties are queried and which is the target
+//                of the managed-memory prefetches.
+//
+// Any CUDA error, host allocation failure or result mismatch terminates the
+// process.
+void runMatrixMultiplyKernel(unsigned int matrixDim, int allocType,
+                             unsigned int numLoops, double *gpuLaunchCallsTimes,
+                             double *gpuTransferToCallsTimes,
+                             double *gpuTransferFromCallsTimes,
+                             double *gpuLaunchAndTransferCallsTimes,
+                             double *gpuLaunchTransferSyncTimes,
+                             double *cpuAccessTimes, double *overallTimes,
+                             int device_id) {
+  float *dptrA = NULL, *hptrA = NULL;
+  float *dptrB = NULL, *hptrB = NULL;
+  float *dptrC = NULL, *hptrC = NULL;
+  float *randValuesX = NULL, *randValuesY = NULL;
+  float *randValuesVerifyXmulY = NULL, *randValuesVerifyYmulX = NULL;
+  bool copyRequired = false, hintsRequired = false;
+  bool someTransferOpRequired;
+  bool isAsync = false;
+  cudaStream_t streamToRunOn;
+  unsigned int *latch;
+  // Widen before multiplying so the byte count is computed in size_t.
+  size_t size = (size_t)matrixDim * matrixDim * sizeof(float);
+  dim3 threads(32, 32);
+  dim3 grid(matrixDim / threads.x, matrixDim / threads.y);
+  StopWatchInterface *gpuLaunchCallsTimer = 0, *gpuTransferCallsTimer = 0;
+  StopWatchInterface *gpuSyncTimer = 0, *cpuAccessTimer = 0;
+  sdkCreateTimer(&gpuLaunchCallsTimer);
+  sdkCreateTimer(&gpuTransferCallsTimer);
+  sdkCreateTimer(&gpuSyncTimer);
+  sdkCreateTimer(&cpuAccessTimer);
+  unsigned int i;
+
+  cudaDeviceProp deviceProp;
+  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device_id));
+  checkCudaErrors(cudaStreamCreate(&streamToRunOn));
+
+  // Host buffers for the two random input matrices and the two reference
+  // products.
+  randValuesX = (float *)malloc(size);
+  if (!randValuesX) {
+    exit(EXIT_FAILURE);  // exit since memory allocation error
+  }
+  randValuesY = (float *)malloc(size);
+  if (!randValuesY) {
+    exit(EXIT_FAILURE);  // exit since memory allocation error
+  }
+  randValuesVerifyXmulY = (float *)malloc(size);
+  if (!randValuesVerifyXmulY) {
+    exit(EXIT_FAILURE);  // exit since memory allocation error
+  }
+  randValuesVerifyYmulX = (float *)malloc(size);
+  if (!randValuesVerifyYmulX) {
+    exit(EXIT_FAILURE);  // exit since memory allocation error
+  }
+  checkCudaErrors(cudaMalloc(&dptrA, size));
+  checkCudaErrors(cudaMalloc(&dptrB, size));
+  checkCudaErrors(cudaMalloc(&dptrC, size));
+
+  fillMatrixWithRandomValues(randValuesX, matrixDim);
+  fillMatrixWithRandomValues(randValuesY, matrixDim);
+
+  // Untimed reference pass on the default stream: compute X*Y and Y*X once so
+  // that every timed iteration can be checked against them.
+  checkCudaErrors(
+      cudaMemcpyAsync(dptrA, randValuesX, size, cudaMemcpyHostToDevice));
+  checkCudaErrors(
+      cudaMemcpyAsync(dptrB, randValuesY, size, cudaMemcpyHostToDevice));
+  matrixMultiplyKernel<<<grid, threads>>>(dptrC, dptrA, dptrB, matrixDim);
+  checkCudaErrors(cudaGetLastError());  // report launch failures
+  checkCudaErrors(cudaMemcpyAsync(randValuesVerifyXmulY, dptrC, size,
+                                  cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaStreamSynchronize(NULL));
+  matrixMultiplyKernel<<<grid, threads>>>(dptrC, dptrB, dptrA, matrixDim);
+  checkCudaErrors(cudaGetLastError());  // report launch failures
+  checkCudaErrors(cudaMemcpyAsync(randValuesVerifyYmulX, dptrC, size,
+                                  cudaMemcpyDeviceToHost));
+  checkCudaErrors(cudaStreamSynchronize(NULL));
+#if VERIFY_GPU_CORRECTNESS
+  verifyMatrixMultiplyCorrectness(randValuesVerifyXmulY, randValuesX,
+                                  randValuesY, matrixDim);
+  verifyMatrixMultiplyCorrectness(randValuesVerifyYmulX, randValuesY,
+                                  randValuesX, matrixDim);
+#endif
+  checkCudaErrors(cudaFree(dptrA));
+  checkCudaErrors(cudaFree(dptrB));
+  checkCudaErrors(cudaFree(dptrC));
+
+  // Page-locked flag written by the host and passed to the
+  // spinWhileLessThanOne kernel in the async-with-hints case.
+  checkCudaErrors(cudaMallocHost(&latch, sizeof(unsigned int)));
+
+  // Allocate the A/B/C buffers for the strategy under test and record which
+  // kind of transfer step that strategy needs.
+  switch (allocType) {
+    case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY:
+    case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC:
+      hptrA = (float *)malloc(size);
+      if (!hptrA) {
+        exit(EXIT_FAILURE);  // exit since memory allocation error
+      }
+      hptrB = (float *)malloc(size);
+      if (!hptrB) {
+        exit(EXIT_FAILURE);  // exit since memory allocation error
+      }
+      hptrC = (float *)malloc(size);
+      if (!hptrC) {
+        exit(EXIT_FAILURE);  // exit since memory allocation error
+      }
+      checkCudaErrors(cudaMalloc(&dptrA, size));
+      checkCudaErrors(cudaMalloc(&dptrB, size));
+      checkCudaErrors(cudaMalloc(&dptrC, size));
+      copyRequired = true;
+      break;
+
+    case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY:
+    case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC:
+      checkCudaErrors(cudaMallocHost(&hptrA, size));
+      checkCudaErrors(cudaMallocHost(&hptrB, size));
+      checkCudaErrors(cudaMallocHost(&hptrC, size));
+      checkCudaErrors(cudaMalloc(&dptrA, size));
+      checkCudaErrors(cudaMalloc(&dptrB, size));
+      checkCudaErrors(cudaMalloc(&dptrC, size));
+      copyRequired = true;
+      break;
+
+    case USE_ZERO_COPY:
+      // The kernel accesses the page-locked host buffers directly through
+      // their device pointers; no transfer step is needed.
+      checkCudaErrors(cudaMallocHost(&hptrA, size));
+      checkCudaErrors(cudaMallocHost(&hptrB, size));
+      checkCudaErrors(cudaMallocHost(&hptrC, size));
+      checkCudaErrors(cudaHostGetDevicePointer(&dptrA, hptrA, 0));
+      checkCudaErrors(cudaHostGetDevicePointer(&dptrB, hptrB, 0));
+      checkCudaErrors(cudaHostGetDevicePointer(&dptrC, hptrC, 0));
+      break;
+
+    case USE_MANAGED_MEMORY:
+      // Host and device use the same managed pointers.
+      checkCudaErrors(cudaMallocManaged(&dptrA, size));
+      checkCudaErrors(cudaMallocManaged(&dptrB, size));
+      checkCudaErrors(cudaMallocManaged(&dptrC, size));
+      hptrA = dptrA;
+      hptrB = dptrB;
+      hptrC = dptrC;
+      break;
+
+    case USE_MANAGED_MEMORY_WITH_HINTS:
+    case USE_MANAGED_MEMORY_WITH_HINTS_ASYNC:
+      // Prefetching is used when the device reports concurrentManagedAccess;
+      // otherwise the buffers are attached to the host.
+      if (deviceProp.concurrentManagedAccess) {
+        checkCudaErrors(cudaMallocManaged(&dptrA, size));
+        checkCudaErrors(cudaMallocManaged(&dptrB, size));
+        checkCudaErrors(cudaMallocManaged(&dptrC, size));
+        checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, cudaCpuDeviceId));
+        checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, cudaCpuDeviceId));
+        checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, cudaCpuDeviceId));
+      } else {
+        checkCudaErrors(cudaMallocManaged(&dptrA, size, cudaMemAttachHost));
+        checkCudaErrors(cudaMallocManaged(&dptrB, size, cudaMemAttachHost));
+        checkCudaErrors(cudaMallocManaged(&dptrC, size, cudaMemAttachHost));
+      }
+      hptrA = dptrA;
+      hptrB = dptrB;
+      hptrC = dptrC;
+      hintsRequired = true;
+      break;
+
+    default:
+      exit(EXIT_FAILURE);  // exit with error
+  }
+
+  if (allocType == USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC ||
+      allocType == USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC ||
+      allocType == USE_MANAGED_MEMORY_WITH_HINTS_ASYNC) {
+    isAsync = true;
+  }
+
+  someTransferOpRequired = copyRequired || hintsRequired;
+
+  // fill buffers with 0 to avoid any first access page-fault overheads.
+  memset(hptrA, 0, size);
+  memset(hptrB, 0, size);
+  memset(hptrC, 0, size);
+
+  for (i = 0; i < numLoops; i++) {
+    // Even iterations compute X*Y, odd iterations compute Y*X. The
+    // parentheses are required: == binds tighter than &, so the unparenthesized
+    // form `i & 0x1 == 0` is always 0.
+    const bool evenIteration = ((i & 0x1) == 0);
+
+    cpuAccessTimes[i] = 0.0;
+    gpuLaunchCallsTimes[i] = 0.0;
+    gpuTransferToCallsTimes[i] = 0.0;
+    gpuTransferFromCallsTimes[i] = 0.0;
+
+    // Timed: the CPU writes the kernel inputs.
+    sdkStartTimer(&cpuAccessTimer);
+    {
+      copyMatrix(hptrA, evenIteration ? randValuesX : randValuesY, matrixDim);
+      copyMatrix(hptrB, evenIteration ? randValuesY : randValuesX, matrixDim);
+    }
+    sdkStopTimer(&cpuAccessTimer);
+    cpuAccessTimes[i] += sdkGetAverageTimerValue(&cpuAccessTimer);
+    sdkResetTimer(&cpuAccessTimer);
+
+    if (isAsync && hintsRequired) {
+      *latch = 0;
+      // Prevent any work on stream from starting until all work is pushed
+      spinWhileLessThanOne<<<1, 1, 0, streamToRunOn>>>(latch);
+    }
+
+    // Timed: calls that move the inputs towards the GPU.
+    if (someTransferOpRequired) {
+      sdkStartTimer(&gpuTransferCallsTimer);
+      if (copyRequired) {
+        if (isAsync) {
+          checkCudaErrors(cudaMemcpyAsync(
+              dptrA, hptrA, size, cudaMemcpyHostToDevice, streamToRunOn));
+          checkCudaErrors(cudaMemcpyAsync(
+              dptrB, hptrB, size, cudaMemcpyHostToDevice, streamToRunOn));
+        } else {
+          checkCudaErrors(
+              cudaMemcpy(dptrA, hptrA, size, cudaMemcpyHostToDevice));
+          checkCudaErrors(
+              cudaMemcpy(dptrB, hptrB, size, cudaMemcpyHostToDevice));
+        }
+      }
+      if (hintsRequired) {
+        if (deviceProp.concurrentManagedAccess) {
+          checkCudaErrors(
+              cudaMemPrefetchAsync(dptrA, size, device_id, streamToRunOn));
+          checkCudaErrors(
+              cudaMemPrefetchAsync(dptrB, size, device_id, streamToRunOn));
+          checkCudaErrors(
+              cudaMemPrefetchAsync(dptrC, size, device_id, streamToRunOn));
+        } else {
+          checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrA, 0,
+                                                   cudaMemAttachGlobal));
+          checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrB, 0,
+                                                   cudaMemAttachGlobal));
+          checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrC, 0,
+                                                   cudaMemAttachGlobal));
+        }
+        if (!isAsync) {
+          checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
+        }
+      }
+
+      sdkStopTimer(&gpuTransferCallsTimer);
+      gpuTransferToCallsTimes[i] +=
+          sdkGetAverageTimerValue(&gpuTransferCallsTimer);
+      sdkResetTimer(&gpuTransferCallsTimer);
+    }
+
+    // Timed: the kernel launch (plus its completion in the non-async case).
+    sdkStartTimer(&gpuLaunchCallsTimer);
+    {
+      matrixMultiplyKernel<<<grid, threads, 0, streamToRunOn>>>(
+          dptrC, dptrA, dptrB, matrixDim);
+      if (!isAsync) {
+        checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
+      }
+    }
+    sdkStopTimer(&gpuLaunchCallsTimer);
+    // Launch failures are only reported through cudaGetLastError(); query it
+    // after the timer has stopped so the measurement is not affected.
+    checkCudaErrors(cudaGetLastError());
+
+    gpuLaunchCallsTimes[i] += sdkGetAverageTimerValue(&gpuLaunchCallsTimer);
+    sdkResetTimer(&gpuLaunchCallsTimer);
+
+    // Timed: calls that move the result back towards the CPU.
+    if (someTransferOpRequired) {
+      sdkStartTimer(&gpuTransferCallsTimer);
+      if (hintsRequired) {
+        if (deviceProp.concurrentManagedAccess) {
+          checkCudaErrors(cudaMemPrefetchAsync(dptrA, size, cudaCpuDeviceId));
+          checkCudaErrors(cudaMemPrefetchAsync(dptrB, size, cudaCpuDeviceId));
+          checkCudaErrors(cudaMemPrefetchAsync(dptrC, size, cudaCpuDeviceId));
+        } else {
+          checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrA, 0,
+                                                   cudaMemAttachHost));
+          checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrB, 0,
+                                                   cudaMemAttachHost));
+          checkCudaErrors(cudaStreamAttachMemAsync(streamToRunOn, dptrC, 0,
+                                                   cudaMemAttachHost));
+        }
+        if (!isAsync) {
+          checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
+        }
+      }
+      if (copyRequired) {
+        if (isAsync) {
+          checkCudaErrors(cudaMemcpyAsync(
+              hptrC, dptrC, size, cudaMemcpyDeviceToHost, streamToRunOn));
+        } else {
+          checkCudaErrors(
+              cudaMemcpy(hptrC, dptrC, size, cudaMemcpyDeviceToHost));
+        }
+      }
+      sdkStopTimer(&gpuTransferCallsTimer);
+      gpuTransferFromCallsTimes[i] +=
+          sdkGetAverageTimerValue(&gpuTransferCallsTimer);
+      sdkResetTimer(&gpuTransferCallsTimer);
+    }
+    gpuLaunchAndTransferCallsTimes[i] = gpuLaunchCallsTimes[i] +
+                                        gpuTransferToCallsTimes[i] +
+                                        gpuTransferFromCallsTimes[i];
+    gpuLaunchTransferSyncTimes[i] = gpuLaunchAndTransferCallsTimes[i];
+    // Timed (async only): release the latch if one was queued and wait for
+    // all work on the stream to finish.
+    if (isAsync) {
+      sdkStartTimer(&gpuSyncTimer);
+      {
+        if (hintsRequired) {
+          *latch = 1;
+        }
+        checkCudaErrors(cudaStreamSynchronize(streamToRunOn));
+      }
+      sdkStopTimer(&gpuSyncTimer);
+      gpuLaunchTransferSyncTimes[i] += sdkGetAverageTimerValue(&gpuSyncTimer);
+      sdkResetTimer(&gpuSyncTimer);
+    }
+
+    // Timed: the CPU reads the result while checking it against the
+    // reference product that matches this iteration's operand order.
+    sdkStartTimer(&cpuAccessTimer);
+    {
+      verifyMatrixData(
+          evenIteration ? randValuesVerifyXmulY : randValuesVerifyYmulX, hptrC,
+          matrixDim);
+    }
+    sdkStopTimer(&cpuAccessTimer);
+    cpuAccessTimes[i] += sdkGetAverageTimerValue(&cpuAccessTimer);
+    sdkResetTimer(&cpuAccessTimer);
+    overallTimes[i] = cpuAccessTimes[i] + gpuLaunchTransferSyncTimes[i];
+  }
+
+  // Release the A/B/C buffers with the API that matches how they were
+  // allocated.
+  switch (allocType) {
+    case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY:
+    case USE_HOST_PAGEABLE_AND_DEVICE_MEMORY_ASYNC:
+      free(hptrA);
+      free(hptrB);
+      free(hptrC);
+      checkCudaErrors(cudaFree(dptrA));
+      checkCudaErrors(cudaFree(dptrB));
+      checkCudaErrors(cudaFree(dptrC));
+      break;
+
+    case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY:
+    case USE_HOST_PAGELOCKED_AND_DEVICE_MEMORY_ASYNC:
+      checkCudaErrors(cudaFreeHost(hptrA));
+      checkCudaErrors(cudaFreeHost(hptrB));
+      checkCudaErrors(cudaFreeHost(hptrC));
+      checkCudaErrors(cudaFree(dptrA));
+      checkCudaErrors(cudaFree(dptrB));
+      checkCudaErrors(cudaFree(dptrC));
+      break;
+
+    case USE_ZERO_COPY:
+      checkCudaErrors(cudaFreeHost(hptrA));
+      checkCudaErrors(cudaFreeHost(hptrB));
+      checkCudaErrors(cudaFreeHost(hptrC));
+      break;
+
+    case USE_MANAGED_MEMORY:
+    case USE_MANAGED_MEMORY_WITH_HINTS:
+    case USE_MANAGED_MEMORY_WITH_HINTS_ASYNC:
+      checkCudaErrors(cudaFree(dptrA));
+      checkCudaErrors(cudaFree(dptrB));
+      checkCudaErrors(cudaFree(dptrC));
+      break;
+
+    default:
+      exit(EXIT_FAILURE);  // exit due to error
+  }
+
+  checkCudaErrors(cudaStreamDestroy(streamToRunOn));
+  checkCudaErrors(cudaFreeHost(latch));
+  free(randValuesX);
+  free(randValuesY);
+  free(randValuesVerifyXmulY);
+  free(randValuesVerifyYmulX);
+  sdkDeleteTimer(&gpuLaunchCallsTimer);
+  sdkDeleteTimer(&gpuTransferCallsTimer);
+  sdkDeleteTimer(&gpuSyncTimer);
+  sdkDeleteTimer(&cpuAccessTimer);
+}
+
+// Runs the matrix-multiply benchmark for every allocation strategy
+// (MEMALLOC_TYPE_START through MEMALLOC_TYPE_END) and every tested matrix
+// size, then prints the collected results and frees them.
+//
+// reportAsBandwidth is forwarded to every result series that is created;
+// print_launch_transfer_results and print_std_deviation are forwarded to
+// printResults(); device_id is forwarded to runMatrixMultiplyKernel().
+void matrixMultiplyPerfRunner(bool reportAsBandwidth,
+                              bool print_launch_transfer_results,
+                              bool print_std_deviation, int device_id) {
+  // The matrix dimension starts at 32 and doubles every step, so the byte
+  // size of one matrix grows by a factor of 4 per step.
+  unsigned int firstDim = 32;
+  unsigned int dimGrowth = 2;
+  unsigned int sizeGrowth = dimGrowth * dimGrowth;
+  unsigned int smallestBytes = firstDim * firstDim * sizeof(float);
+  // 3 buffers are used, but dividing by 4 (power of 2)
+  unsigned int largestBytes = (maxSampleSizeInMb * ONE_MB) / 4;
+
+  unsigned int numSizesToTest =
+      findNumSizesToTest(smallestBytes, largestBytes, sizeGrowth);
+
+  struct testResults *results;
+  createAndInitTestResults(&results, "matrixMultiplyPerf", numKernelRuns,
+                           numSizesToTest);
+
+  unsigned long *sizesToTest = getPtrSizesToTest(results);
+
+  // One result series per measured phase, registered in this order.
+  struct resultsData *launchSeries;
+  struct resultsData *transferToSeries;
+  struct resultsData *transferFromSeries;
+  struct resultsData *launchAndTransferSeries;
+  struct resultsData *launchTransferSyncSeries;
+  struct resultsData *cpuAccessSeries;
+  struct resultsData *overallSeries;
+
+  createResultDataAndAddToTestResults(&launchSeries, results,
+                                      "GPU Kernel Launch Call Time", false,
+                                      reportAsBandwidth);
+  createResultDataAndAddToTestResults(&transferToSeries, results,
+                                      "CPU to GPU Transfer Calls Time", false,
+                                      reportAsBandwidth);
+  createResultDataAndAddToTestResults(&transferFromSeries, results,
+                                      "GPU to CPU Transfer Calls Time", false,
+                                      reportAsBandwidth);
+  createResultDataAndAddToTestResults(&launchAndTransferSeries, results,
+                                      "GPU Launch and Transfer Calls Time",
+                                      false, reportAsBandwidth);
+  createResultDataAndAddToTestResults(&launchTransferSyncSeries, results,
+                                      "GPU Launch Transfer and Sync Time",
+                                      false, reportAsBandwidth);
+  createResultDataAndAddToTestResults(&cpuAccessSeries, results,
+                                      "CPU Access Time", false,
+                                      reportAsBandwidth);
+  createResultDataAndAddToTestResults(&overallSeries, results, "Overall Time",
+                                      false, reportAsBandwidth);
+
+  printf("Running ");
+  unsigned int sizeIdx = 0;
+  unsigned int dim = firstDim;
+  while (dim * dim <= largestBytes / sizeof(float)) {
+    sizesToTest[sizeIdx] = dim * dim * sizeof(float);
+
+    int allocType;
+    for (allocType = MEMALLOC_TYPE_START; allocType <= MEMALLOC_TYPE_END;
+         allocType++) {
+      // One progress dot per (size, allocation strategy) combination.
+      printf(".");
+      fflush(stdout);
+      runMatrixMultiplyKernel(
+          dim, allocType, numKernelRuns,
+          getPtrRunTimesInMs(launchSeries, allocType, sizeIdx),
+          getPtrRunTimesInMs(transferToSeries, allocType, sizeIdx),
+          getPtrRunTimesInMs(transferFromSeries, allocType, sizeIdx),
+          getPtrRunTimesInMs(launchAndTransferSeries, allocType, sizeIdx),
+          getPtrRunTimesInMs(launchTransferSyncSeries, allocType, sizeIdx),
+          getPtrRunTimesInMs(cpuAccessSeries, allocType, sizeIdx),
+          getPtrRunTimesInMs(overallSeries, allocType, sizeIdx), device_id);
+    }
+
+    dim *= dimGrowth;
+    ++sizeIdx;
+  }
+  printf("\n");
+  printResults(results, print_launch_transfer_results, print_std_deviation);
+  freeTestResultsAndAllResultsData(results);
+}
+
+// Prints the command-line synopsis and a description of each option to
+// stdout.
+static void usage() {
+  // One entry per message, emitted in order.
+  static const char *const helpText[] = {
+      "./cudaMemoryTypesPerf [-device=<device_id>] [-reportAsBandwidth] "
+      "[-print-launch-transfer-results] [-print-std-deviation] [-verbose]\n",
+      "Options:\n",
+      "-reportAsBandwidth:             By default time taken is printed, this "
+      "option allows to instead print bandwidth.\n",
+      "-print-launch-transfer-results: By default overall results are printed, "
+      "this option allows to print data transfers and kernel time as well.\n",
+      "-print-std-deviation:           Prints std deviation of the results.\n",
+      "-kernel-iterations=<num>:       Number of times the kernel tests should "
+      "be run[default is 100 iterations].\n",
+      "-device=<device_id>:            Allows to pass GPU Device ID on which "
+      "the tests will be run.\n",
+      "-verbose:                       Prints highly verbose output.\n"};
+
+  const unsigned int numEntries = sizeof(helpText) / sizeof(helpText[0]);
+  for (unsigned int n = 0; n < numEntries; ++n) {
+    printf("%s", helpText[n]);
+  }
+}
+
+// Program entry point: parses the command-line flags, selects the CUDA
+// device and runs the matrix-multiply benchmark.
+//
+// Flags handled here: help / h (print usage and exit with EXIT_WAIVED),
+// reportAsBandwidth, print-launch-transfer-results, print-std-deviation,
+// kernel-iterations=<num> and verbose. The remaining arguments are passed to
+// findCudaDevice(). The process exits with EXIT_SUCCESS after the benchmark
+// completes, or with EXIT_FAILURE when kernel-iterations is not positive.
+int main(int argc, char **argv) {
+  bool reportAsBandwidth = false;
+  bool print_launch_transfer_results = false;
+  bool print_std_deviation = false;
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
+      checkCmdLineFlag(argc, (const char **)argv, "h")) {
+    usage();
+    printf("&&&& %s WAIVED\n", argv[0]);
+    exit(EXIT_WAIVED);
+  }
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "reportAsBandwidth")) {
+    reportAsBandwidth = true;
+  }
+
+  if (checkCmdLineFlag(argc, (const char **)argv,
+                       "print-launch-transfer-results")) {
+    print_launch_transfer_results = true;
+  }
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "print-std-deviation")) {
+    print_std_deviation = true;
+  }
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "kernel-iterations")) {
+    numKernelRuns =
+        getCmdLineArgumentInt(argc, (const char **)argv, "kernel-iterations");
+    // The count is later used as an unsigned loop bound and as the number of
+    // measurements, so zero or a negative value must be rejected here.
+    if (numKernelRuns <= 0) {
+      printf("Invalid -kernel-iterations value, it must be at least 1\n");
+      usage();
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "verbose")) {
+    verboseResults = 1;
+  }
+
+  int device_id = findCudaDevice(argc, (const char **)argv);
+
+  matrixMultiplyPerfRunner(reportAsBandwidth, print_launch_transfer_results,
+                           print_std_deviation, device_id);
+
+  printf(
+      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
+      "Results may vary when GPU Boost is enabled.\n");
+  exit(EXIT_SUCCESS);
+}
--- a/Samples/conjugateGradientCudaGraphs/Makefile
+++ b/Samples/conjugateGradientCudaGraphs/Makefile
@ -0,0 +1,302 @@
+################################################################################
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project (only supported on Mac OS X and Linux platforms)
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+endif
+
+ifeq ($(TARGET_OS),qnx)
+    CCFLAGS += -DWIN_INTERFACE_CUSTOM
+    LDFLAGS += -lsocket
+endif
+
+# Install directory of different arch
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+ifeq ($(dbg),1)
+      NVCCFLAGS += -g -G
+      BUILD_TYPE := debug
+else
+      BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+SAMPLE_ENABLED := 1
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES  := -I../../Common
+LIBRARIES :=
+
+################################################################################
+
+# Gencode arguments
+SMS ?= 30 35 37 50 52 60 61 70 75
+
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS)
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
+HIGHEST_SM := $(lastword $(sort $(SMS)))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+LIBRARIES += -lcublas_static -lcusparse_static -lculibos
+
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+all: build
+
+build: conjugateGradientCudaGraphs
+
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+conjugateGradientCudaGraphs.o:conjugateGradientCudaGraphs.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+conjugateGradientCudaGraphs: conjugateGradientCudaGraphs.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./conjugateGradientCudaGraphs
+
+clean:
+	rm -f conjugateGradientCudaGraphs conjugateGradientCudaGraphs.o
+	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/conjugateGradientCudaGraphs
+
+clobber: clean
--- a/Samples/conjugateGradientCudaGraphs/NsightEclipse.xml
+++ b/Samples/conjugateGradientCudaGraphs/NsightEclipse.xml
@ -0,0 +1,84 @@
+<?xml version="1.0" encoding="UTF-8"?> 
+<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
+<entry>
+  <name>conjugateGradientCudaGraphs</name>
+  <cuda_api_list>
+    <toolkit>cudaStreamBeginCapture</toolkit>
+    <toolkit>cudaStreamEndCapture</toolkit>
+    <toolkit>cudaGraphCreate</toolkit>
+    <toolkit>cudaGraphLaunch</toolkit>
+    <toolkit>cudaGraphInstantiate</toolkit>
+    <toolkit>cudaGraphExecDestroy</toolkit>
+    <toolkit>cudaGraphDestroy</toolkit>
+  </cuda_api_list>
+  <description><![CDATA[This sample implements a conjugate gradient solver on GPU using CUBLAS and CUSPARSE library calls captured and called using CUDA Graph APIs.]]></description>
+  <devicecompilation>whole</devicecompilation>
+  <includepaths>
+    <path>./</path>
+    <path>../</path>
+    <path>../../common/inc</path>
+  </includepaths>
+  <keyconcepts>
+    <concept level="advanced">Linear Algebra</concept>
+    <concept level="advanced">CUBLAS Library</concept>
+    <concept level="advanced">CUSPARSE Library</concept>
+  </keyconcepts>
+  <keywords>
+    <keyword>CUDA</keyword>
+    <keyword>CUBLAS</keyword>
+    <keyword>CUSPARSE</keyword>
+    <keyword>Sparse Matrix</keyword>
+  </keywords>
+  <libraries>
+    <library>cublas_static</library>
+    <library>cusparse_static</library>
+    <library>culibos</library>
+  </libraries>
+  <librarypaths>
+  </librarypaths>
+  <nsight_eclipse>true</nsight_eclipse>
+  <primary_file>conjugateGradientCudaGraphs.cu</primary_file>
+  <required_dependencies>
+    <dependency>CUBLAS</dependency>
+    <dependency>CUSPARSE</dependency>
+  </required_dependencies>
+  <scopes>
+    <scope>1:CUDA Advanced Topics</scope>
+    <scope>3:Linear Algebra</scope>
+    <scope>1:CUDA Graphs</scope>
+  </scopes>
+  <sm-arch>sm30</sm-arch>
+  <sm-arch>sm35</sm-arch>
+  <sm-arch>sm37</sm-arch>
+  <sm-arch>sm50</sm-arch>
+  <sm-arch>sm52</sm-arch>
+  <sm-arch>sm60</sm-arch>
+  <sm-arch>sm61</sm-arch>
+  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm75</sm-arch>
+  <supported_envs>
+    <env>
+      <arch>x86_64</arch>
+      <platform>linux</platform>
+    </env>
+    <env>
+      <platform>windows7</platform>
+    </env>
+    <env>
+      <arch>x86_64</arch>
+      <platform>macosx</platform>
+    </env>
+    <env>
+      <arch>arm</arch>
+    </env>
+    <env>
+      <arch>ppc64le</arch>
+      <platform>linux</platform>
+    </env>
+  </supported_envs>
+  <supported_sm_architectures>
+    <include>all</include>
+  </supported_sm_architectures>
+  <title>Conjugate Gradient using Cuda Graphs</title>
+  <type>exe</type>
+</entry>
--- a/Samples/conjugateGradientCudaGraphs/README.md
+++ b/Samples/conjugateGradientCudaGraphs/README.md
@ -0,0 +1,98 @@
+# conjugateGradientCudaGraphs - Conjugate Gradient using Cuda Graphs
+
+## Description
+
+This sample implements a conjugate gradient solver on GPU using CUBLAS and CUSPARSE library calls captured and called using CUDA Graph APIs.
+
+## Key Concepts
+
+Linear Algebra, CUBLAS Library, CUSPARSE Library
+
+## Supported SM Architectures
+
+[SM 3.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows, MacOSX
+
+## Supported CPU Architecture
+
+x86_64, ppc64le, armv7l
+
+## CUDA APIs involved
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch, cudaGraphInstantiate, cudaGraphExecDestroy, cudaGraphDestroy
+
+## Dependencies needed to build/run
+[CUBLAS](../../README.md#cublas), [CUSPARSE](../../README.md#cusparse)
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
+    By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```
+
+### Mac
+The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+
+The samples makefiles can take advantage of certain options:
+
+*  **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+
+*  **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60".
+    ```
+    $ make SMS="A B ..."
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers.
+    ```
+    $ make HOST_COMPILER=clang
+    ```
+
+## References (for more details)
+
--- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu
+++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs.cu
@ -0,0 +1,466 @@
+/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+/*
+ * This sample implements a conjugate gradient solver on GPU
+ * using CUBLAS and CUSPARSE with CUDA Graphs
+ *
+ */
+
+// includes, system
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Using updated (v2) interfaces to cublas */
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <cusparse.h>
+
+#include <cooperative_groups.h>
+
+// Utilities and system includes
+#include <helper_cuda.h>  // helper function CUDA error checking and initialization
+#include <helper_functions.h>  // helper for shared functions common to CUDA Samples
+
+namespace cg = cooperative_groups;
+
+const char *sSDKname = "conjugateGradientCudaGraphs";
+
+#ifndef WITH_GRAPH
+#define WITH_GRAPH 1
+#endif
+
+/*
+ * genTridiag: generate a random tridiagonal symmetric matrix in CSR form.
+ *
+ * I   : row-offset array with room for N + 1 entries (output)
+ * J   : column-index array with room for nz entries (output)
+ * val : value array with room for nz entries (output)
+ * N   : number of rows/columns
+ * nz  : number of stored entries; only used to terminate I (I[N] = nz)
+ *
+ * Diagonal entries are drawn from [10, 11] and off-diagonal entries from
+ * [0, 1] using rand(). Each sub-diagonal entry copies the super-diagonal
+ * entry of the previous row, which is what makes the matrix symmetric.
+ *
+ * For N < 1 the function returns without touching any array. For N == 1 only
+ * the single diagonal entry is written, so J and val need just one element.
+ */
+void genTridiag(int *I, int *J, float *val, int N, int nz) {
+  if (N < 1) {
+    return;
+  }
+
+  // Row 0: the diagonal entry, plus a super-diagonal entry when the matrix
+  // actually has a second column.
+  I[0] = 0;
+  J[0] = 0;
+  val[0] = (float)rand() / RAND_MAX + 10.0f;
+
+  if (N > 1) {
+    J[1] = 1;
+    val[1] = (float)rand() / RAND_MAX;
+  }
+
+  for (int i = 1; i < N; i++) {
+    // Row 0 holds two entries, every later row starts three entries further.
+    I[i] = (i > 1) ? I[i - 1] + 3 : 2;
+
+    const int start = (i - 1) * 3 + 2;
+    const bool hasSuperDiagonal = (i < N - 1);
+
+    J[start] = i - 1;
+    J[start + 1] = i;
+
+    if (hasSuperDiagonal) {
+      J[start + 2] = i + 1;
+    }
+
+    // Mirror the previous row's super-diagonal value to keep A symmetric.
+    val[start] = val[start - 1];
+    val[start + 1] = (float)rand() / RAND_MAX + 10.0f;
+
+    if (hasSuperDiagonal) {
+      val[start + 2] = (float)rand() / RAND_MAX;
+    }
+  }
+
+  I[N] = nz;
+}
+
+/* initVectors: set the first N entries of rhs to one and of x to zero.
+ * Every thread starts at its global index and advances by the total number
+ * of launched threads, so any grid size covers the whole range. */
+__global__ void initVectors(float *rhs, float *x, int N) {
+  const size_t first = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t step = gridDim.x * blockDim.x;
+
+  size_t idx = first;
+  while (idx < N) {
+    rhs[idx] = 1.0f;
+    x[idx] = 0.0f;
+    idx += step;
+  }
+}
+
+/*
+ * gpuDotProduct: accumulate the dot product of vecA and vecB into *result.
+ *
+ * Each thread sums its share of the element products in double precision.
+ * The per-thread sums are then combined through the dynamic shared array
+ * tmp, and thread 0 of every block adds its block total to *result with
+ * atomicAdd.
+ *
+ * vecA, vecB : input vectors, read at indices [0, size)
+ * result     : single float that is added to, never overwritten; the launches
+ *              in main() clear it with cudaMemsetAsync beforehand
+ * size       : number of elements to process
+ *
+ * tmp is indexed by cta.thread_rank(), so the launch has to supply at least
+ * one double of dynamic shared memory per thread of the block.
+ * NOTE(review): the final loop reads only every tile32.size()-th slot, which
+ * presumably assumes the block size is a multiple of 32 - confirm.
+ */
+__global__ void gpuDotProduct(float *vecA, float *vecB, float *result,
+                              int size) {
+  cg::thread_block cta = cg::this_thread_block();
+
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  extern __shared__ double tmp[];
+
+  // Per-thread partial sum; the product itself is formed in float and only
+  // then widened to double.
+  double temp_sum = 0.0;
+  for (int i = gid; i < size; i += gridDim.x * blockDim.x) {
+    temp_sum += (double)(vecA[i] * vecB[i]);
+  }
+  tmp[cta.thread_rank()] = temp_sum;
+
+  // All partial sums must be visible in tmp before any thread reads a
+  // neighbour's slot.
+  cg::sync(cta);
+
+  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
+
+  double beta = temp_sum;
+  double temp;
+
+  // Tree reduction inside each 32-thread tile: the active half adds the value
+  // stored i slots above its own and publishes the running total again.
+  for (int i = tile32.size() / 2; i > 0; i >>= 1) {
+    if (tile32.thread_rank() < i) {
+      temp = tmp[cta.thread_rank() + i];
+      beta += temp;
+      tmp[cta.thread_rank()] = beta;
+    }
+    cg::sync(tile32);
+  }
+  // Make every tile's total (held in the slot of its rank-0 thread) visible
+  // to thread 0 of the block.
+  cg::sync(cta);
+
+  if (cta.thread_rank() == 0) {
+    // Add up one slot per tile, then fold the block total into the result.
+    beta = 0.0;
+    for (int i = 0; i < cta.size(); i += tile32.size()) {
+      beta += tmp[i];
+    }
+    atomicAdd(result, (float)beta);
+  }
+}
+
+/*
+ * gpuSpMV: outputVecY = alpha * A * inputVecX for a CSR matrix A.
+ *
+ * I holds the row offsets (num_rows + 1 entries are read), J the column
+ * indices and val the stored values. Each thread handles whole rows, starting
+ * at its global index and stepping by the total number of launched threads.
+ * nnz is accepted for interface compatibility but is not read.
+ */
+__global__ void gpuSpMV(int *I, int *J, float *val, int nnz, int num_rows,
+                        float alpha, float *inputVecX, float *outputVecY) {
+  const size_t firstRow = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t rowStep = blockDim.x * gridDim.x;
+
+  size_t row = firstRow;
+  while (row < num_rows) {
+    // Entries of this row live in [rowBegin, rowEnd) of J and val.
+    const int rowBegin = I[row];
+    const int rowEnd = I[row + 1];
+
+    float acc = 0.0f;
+    for (int k = rowBegin; k < rowEnd; k++) {
+      acc += alpha * val[k] * inputVecX[J[k]];
+    }
+
+    outputVecY[row] = acc;
+    row += rowStep;
+  }
+}
+
+/* r1_div_x: store the scalar quotient r1[0] / r0[0] in b[0].
+ * Only the thread with global index 0 does any work. */
+__global__ void r1_div_x(float *r1, float *r0, float *b) {
+  const bool isFirstThread = (blockIdx.x * blockDim.x + threadIdx.x) == 0;
+  if (!isFirstThread) {
+    return;
+  }
+
+  *b = *r1 / *r0;
+}
+
+/* a_minus: store the negated scalar -a[0] in na[0].
+ * Only the thread with global index 0 does any work. */
+__global__ void a_minus(float *a, float *na) {
+  const bool isFirstThread = (blockIdx.x * blockDim.x + threadIdx.x) == 0;
+  if (!isFirstThread) {
+    return;
+  }
+
+  *na = -(*a);
+}
+
+/*
+ * main: solve A x = rhs with the conjugate gradient method, where A is a
+ * random tridiagonal symmetric matrix of order N and rhs is all ones.
+ *
+ * The first iteration (k = 1) is issued directly on stream1 with cuBLAS and
+ * cuSPARSE calls. When WITH_GRAPH is non-zero, the body of every further
+ * iteration is captured once from stream1 into a CUDA graph and replayed on
+ * streamForGraph; otherwise the same steps are issued on stream1 each time.
+ * Iteration stops when the squared residual r1 drops to tol * tol or k
+ * exceeds max_iter.
+ *
+ * After the solve, the result is copied back and the largest absolute
+ * difference between A x and rhs is printed. The process exits with status 0
+ * when k <= max_iter after the loop and with status 1 otherwise.
+ */
+int main(int argc, char **argv) {
+  int N = 0, nz = 0, *I = NULL, *J = NULL;
+  float *val = NULL;
+  const float tol = 1e-5f;
+  const int max_iter = 10000;
+  float *x;
+  float *rhs;
+  float r1;
+
+  int *d_col, *d_row;
+  float *d_val, *d_x;
+  float *d_r, *d_p, *d_Ax;
+  int k;
+  float alpha, beta, alpham1;
+
+  cudaStream_t stream1, streamForGraph;
+
+  // This will pick the best possible CUDA capable device
+  cudaDeviceProp deviceProp;
+  int devID = findCudaDevice(argc, (const char **)argv);
+
+  if (devID < 0) {
+    printf("exiting...\n");
+    exit(EXIT_SUCCESS);
+  }
+
+  checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
+
+  // Statistics about the GPU device
+  printf(
+      "> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
+      deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);
+
+  /* Generate a random tridiagonal symmetric matrix in CSR format */
+  N = 1048576;
+  // First and last row hold two entries, every other row holds three.
+  nz = (N - 2) * 3 + 4;
+  I = (int *)malloc(sizeof(int) * (N + 1));
+  J = (int *)malloc(sizeof(int) * nz);
+  val = (float *)malloc(sizeof(float) * nz);
+  genTridiag(I, J, val, N, nz);
+
+  x = (float *)malloc(sizeof(float) * N);
+  rhs = (float *)malloc(sizeof(float) * N);
+
+  // Host copies of the right-hand side and the initial guess; rhs is used
+  // again at the end to measure the error of the computed solution.
+  for (int i = 0; i < N; i++) {
+    rhs[i] = 1.0;
+    x[i] = 0.0;
+  }
+
+  /* Get handle to the CUBLAS context */
+  cublasHandle_t cublasHandle = 0;
+  cublasStatus_t cublasStatus;
+  cublasStatus = cublasCreate(&cublasHandle);
+
+  checkCudaErrors(cublasStatus);
+
+  /* Get handle to the CUSPARSE context */
+  cusparseHandle_t cusparseHandle = 0;
+  cusparseStatus_t cusparseStatus;
+  cusparseStatus = cusparseCreate(&cusparseHandle);
+
+  checkCudaErrors(cusparseStatus);
+
+  checkCudaErrors(cudaStreamCreate(&stream1));
+
+  // Device storage for the CSR matrix and the CG work vectors.
+  checkCudaErrors(cudaMalloc((void **)&d_col, nz * sizeof(int)));
+  checkCudaErrors(cudaMalloc((void **)&d_row, (N + 1) * sizeof(int)));
+  checkCudaErrors(cudaMalloc((void **)&d_val, nz * sizeof(float)));
+  checkCudaErrors(cudaMalloc((void **)&d_x, N * sizeof(float)));
+  checkCudaErrors(cudaMalloc((void **)&d_r, N * sizeof(float)));
+  checkCudaErrors(cudaMalloc((void **)&d_p, N * sizeof(float)));
+  checkCudaErrors(cudaMalloc((void **)&d_Ax, N * sizeof(float)));
+
+  // Device-resident scalars, so the iteration never has to read a value back
+  // to the host in order to continue.
+  float *d_r1, *d_r0, *d_dot, *d_a, *d_na, *d_b;
+  checkCudaErrors(cudaMalloc((void **)&d_r1, sizeof(float)));
+  checkCudaErrors(cudaMalloc((void **)&d_r0, sizeof(float)));
+  checkCudaErrors(cudaMalloc((void **)&d_dot, sizeof(float)));
+  checkCudaErrors(cudaMalloc((void **)&d_a, sizeof(float)));
+  checkCudaErrors(cudaMalloc((void **)&d_na, sizeof(float)));
+  checkCudaErrors(cudaMalloc((void **)&d_b, sizeof(float)));
+
+  cusparseMatDescr_t descr = 0;
+  checkCudaErrors(cusparseCreateMatDescr(&descr));
+
+  checkCudaErrors(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
+  checkCudaErrors(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
+
+  int numBlocks = 0, blockSize = 0, numBlocks2 = 0, blockSize2 = 0;
+  checkCudaErrors(
+      cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, initVectors));
+
+  checkCudaErrors(cudaMemcpyAsync(d_col, J, nz * sizeof(int),
+                                  cudaMemcpyHostToDevice, stream1));
+  checkCudaErrors(cudaMemcpyAsync(d_row, I, (N + 1) * sizeof(int),
+                                  cudaMemcpyHostToDevice, stream1));
+  checkCudaErrors(cudaMemcpyAsync(d_val, val, nz * sizeof(float),
+                                  cudaMemcpyHostToDevice, stream1));
+
+  // d_r starts as the right-hand side (all ones), d_x as the zero guess.
+  initVectors<<<numBlocks, blockSize, 0, stream1>>>(d_r, d_x, N);
+
+  // Launch shapes for the two custom kernels used inside the graph.
+  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&numBlocks2, &blockSize2,
+                                                     gpuSpMV));
+  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize,
+                                                     gpuDotProduct));
+
+  alpha = 1.0;
+  alpham1 = -1.0;
+  beta = 0.0;
+
+  // Initial residual: r = rhs - A * x.
+  checkCudaErrors(cusparseSetStream(cusparseHandle, stream1));
+  checkCudaErrors(
+      cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz,
+                     &alpha, descr, d_val, d_row, d_col, d_x, &beta, d_Ax));
+
+  checkCudaErrors(cublasSetStream(cublasHandle, stream1));
+  checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpham1, d_Ax, 1, d_r, 1));
+
+  // From here on the cuBLAS scalar arguments are device pointers.
+  checkCudaErrors(
+      cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE));
+  checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1));
+
+  k = 1;
+  // First Iteration when k=1 starts
+  checkCudaErrors(cublasScopy(cublasHandle, N, d_r, 1, d_p, 1));
+  checkCudaErrors(
+      cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz,
+                     &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax));
+
+  checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));
+
+  // a = r1 / (p . Ap)
+  r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_dot, d_a);
+
+  // x = x + a * p
+  checkCudaErrors(cublasSaxpy(cublasHandle, N, d_a, d_p, 1, d_x, 1));
+
+  a_minus<<<1, 1, 0, stream1>>>(d_a, d_na);
+
+  // r = r - a * Ap
+  checkCudaErrors(cublasSaxpy(cublasHandle, N, d_na, d_Ax, 1, d_r, 1));
+
+  // Keep the previous squared residual in r0 before r1 is recomputed.
+  checkCudaErrors(cudaMemcpyAsync(d_r0, d_r1, sizeof(float),
+                                  cudaMemcpyDeviceToDevice, stream1));
+
+  checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1));
+
+  checkCudaErrors(cudaMemcpyAsync(&r1, d_r1, sizeof(float),
+                                  cudaMemcpyDeviceToHost, stream1));
+  checkCudaErrors(cudaStreamSynchronize(stream1));
+  printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
+  // First Iteration when k=1 ends
+  k++;
+
+#if WITH_GRAPH
+  // Record one full CG iteration from stream1 into a graph. Nothing issued
+  // between Begin/EndCapture runs now; it only runs when the graph is
+  // launched.
+  cudaGraph_t initGraph;
+  checkCudaErrors(cudaStreamCreate(&streamForGraph));
+  checkCudaErrors(cublasSetStream(cublasHandle, stream1));
+  checkCudaErrors(cusparseSetStream(cusparseHandle, stream1));
+  checkCudaErrors(cudaStreamBeginCapture(stream1));
+
+  // b = r1 / r0, then p = r + b * p (scale with the device scalar b, add r
+  // with the host scalar alpha, hence the pointer-mode switches).
+  r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b);
+  cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
+  checkCudaErrors(cublasSscal(cublasHandle, N, d_b, d_p, 1));
+  cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_HOST);
+  checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1));
+  cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
+
+#if 0 // Use cusparseScsrmv API when it is cuda graph compliant
+  checkCudaErrors(
+      cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST));
+  checkCudaErrors(
+      cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz,
+                     &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax));
+#else
+  gpuSpMV<<<numBlocks2, blockSize2, 0, stream1>>>(d_row, d_col, d_val, nz, N,
+                                                  alpha, d_p, d_Ax);
+#endif
+
+  // gpuDotProduct accumulates with atomicAdd, so its target is cleared first.
+  checkCudaErrors(cudaMemsetAsync(d_dot, 0, sizeof(float), stream1));
+  // Use cublasSdot API when it is cuda graph compliant.
+  // checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));
+  gpuDotProduct<<<numBlocks, blockSize, blockSize * sizeof(double), stream1>>>(
+      d_p, d_Ax, d_dot, N);
+
+  r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_dot, d_a);
+
+  checkCudaErrors(cublasSaxpy(cublasHandle, N, d_a, d_p, 1, d_x, 1));
+
+  a_minus<<<1, 1, 0, stream1>>>(d_a, d_na);
+
+  checkCudaErrors(cublasSaxpy(cublasHandle, N, d_na, d_Ax, 1, d_r, 1));
+
+  checkCudaErrors(cudaMemcpyAsync(d_r0, d_r1, sizeof(float),
+                                  cudaMemcpyDeviceToDevice, stream1));
+  checkCudaErrors(cudaMemsetAsync(d_r1, 0, sizeof(float), stream1));
+  // Use cublasSdot API when it is cuda graph compliant.
+  // checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1));
+  gpuDotProduct<<<numBlocks, blockSize, blockSize * sizeof(double), stream1>>>(
+      d_r, d_r, d_r1, N);
+  // The copy into the host variable r1 is part of the graph, so every launch
+  // refreshes the value tested by the loop condition below.
+  checkCudaErrors(cudaMemcpyAsync((float *)&r1, d_r1, sizeof(float),
+                                  cudaMemcpyDeviceToHost, stream1));
+
+  checkCudaErrors(cudaStreamEndCapture(stream1, &initGraph));
+  cudaGraphExec_t graphExec;
+  checkCudaErrors(cudaGraphInstantiate(&graphExec, initGraph, NULL, NULL, 0));
+#endif
+
+  checkCudaErrors(cublasSetStream(cublasHandle, stream1));
+  checkCudaErrors(cusparseSetStream(cusparseHandle, stream1));
+
+  // r1 holds the squared residual norm, so it is compared against tol^2.
+  while (r1 > tol * tol && k <= max_iter) {
+#if WITH_GRAPH
+    checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph));
+    checkCudaErrors(cudaStreamSynchronize(streamForGraph));
+#else
+    r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_r0, d_b);
+    cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
+    checkCudaErrors(cublasSscal(cublasHandle, N, d_b, d_p, 1));
+
+    cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_HOST);
+    checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1));
+
+    checkCudaErrors(cusparseScsrmv(
+        cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz, &alpha,
+        descr, d_val, d_row, d_col, d_p, &beta, d_Ax));
+
+    cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);
+    checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));
+
+    r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_dot, d_a);
+
+    checkCudaErrors(cublasSaxpy(cublasHandle, N, d_a, d_p, 1, d_x, 1));
+
+    a_minus<<<1, 1, 0, stream1>>>(d_a, d_na);
+    checkCudaErrors(cublasSaxpy(cublasHandle, N, d_na, d_Ax, 1, d_r, 1));
+
+    checkCudaErrors(cudaMemcpyAsync(d_r0, d_r1, sizeof(float),
+                                    cudaMemcpyDeviceToDevice, stream1));
+
+    checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1));
+    checkCudaErrors(cudaMemcpyAsync((float *)&r1, d_r1, sizeof(float),
+                                    cudaMemcpyDeviceToHost, stream1));
+    checkCudaErrors(cudaStreamSynchronize(stream1));
+#endif
+    printf("iteration = %3d, residual = %e\n", k, sqrt(r1));
+    k++;
+  }
+
+  // Fetch the solution on the stream that executed the last iteration.
+#if WITH_GRAPH
+  checkCudaErrors(cudaMemcpyAsync(x, d_x, N * sizeof(float),
+                                  cudaMemcpyDeviceToHost, streamForGraph));
+  checkCudaErrors(cudaStreamSynchronize(streamForGraph));
+#else
+  checkCudaErrors(cudaMemcpyAsync(x, d_x, N * sizeof(float),
+                                  cudaMemcpyDeviceToHost, stream1));
+  checkCudaErrors(cudaStreamSynchronize(stream1));
+#endif
+
+  // Host-side check: err is the largest |(A x)[i] - rhs[i]| over all rows.
+  float rsum, diff, err = 0.0;
+
+  for (int i = 0; i < N; i++) {
+    rsum = 0.0;
+
+    for (int j = I[i]; j < I[i + 1]; j++) {
+      rsum += val[j] * x[J[j]];
+    }
+
+    diff = fabs(rsum - rhs[i]);
+
+    if (diff > err) {
+      err = diff;
+    }
+  }
+
+#if WITH_GRAPH
+  checkCudaErrors(cudaGraphExecDestroy(graphExec));
+  checkCudaErrors(cudaGraphDestroy(initGraph));
+  checkCudaErrors(cudaStreamDestroy(streamForGraph));
+#endif
+  checkCudaErrors(cudaStreamDestroy(stream1));
+  // The matrix descriptor is created separately from the cuSPARSE handle and
+  // has to be released separately as well.
+  cusparseDestroyMatDescr(descr);
+  cusparseDestroy(cusparseHandle);
+  cublasDestroy(cublasHandle);
+
+  free(I);
+  free(J);
+  free(val);
+  free(x);
+  free(rhs);
+  cudaFree(d_col);
+  cudaFree(d_row);
+  cudaFree(d_val);
+  cudaFree(d_x);
+  cudaFree(d_r);
+  cudaFree(d_p);
+  cudaFree(d_Ax);
+  // Device scalars used by the iteration.
+  cudaFree(d_r1);
+  cudaFree(d_r0);
+  cudaFree(d_dot);
+  cudaFree(d_a);
+  cudaFree(d_na);
+  cudaFree(d_b);
+
+  printf("Test Summary:  Error amount = %f\n", err);
+  exit((k <= max_iter) ? 0 : 1);
+}
--- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2010.sln
+++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2010.sln
@ -1,7 +1,7 @@

-Microsoft Visual Studio Solution File, Format Version 11.00
-# Visual Studio 2010
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientMultiBlockCG", "conjugateGradientMultiBlockCG_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
--- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.vcxproj
+++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2012.vcxproj
@ -15,14 +15,15 @@
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
-    <RootNamespace>simpleCUFFT_vs2010</RootNamespace>
-    <ProjectName>simpleCUFFT</ProjectName>
+    <RootNamespace>conjugateGradientCudaGraphs_vs2012</RootNamespace>
+    <ProjectName>conjugateGradientCudaGraphs</ProjectName>
    <CudaToolkitCustomDir />
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup>
    <ConfigurationType>Application</ConfigurationType>
    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v110</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
    <UseDebugLibraries>true</UseDebugLibraries>
@ -32,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -56,12 +57,12 @@
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
-      <AdditionalDependencies>cufft.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
-      <OutputFile>$(OutDir)/simpleCUFFT.exe</OutputFile>
+      <OutputFile>$(OutDir)/conjugateGradientCudaGraphs.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -96,11 +97,11 @@
    </CudaCompile>
  </ItemDefinitionGroup>
  <ItemGroup>
-    <CudaCompile Include="simpleCUFFT.cu" />
+    <CudaCompile Include="conjugateGradientCudaGraphs.cu" />

  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.sln
+++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 13.00
+# Visual Studio 2013
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.vcxproj
+++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2013.vcxproj
@ -0,0 +1,107 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>conjugateGradientCudaGraphs_vs2013</RootNamespace>
+    <ProjectName>conjugateGradientCudaGraphs</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/conjugateGradientCudaGraphs.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="conjugateGradientCudaGraphs.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.sln
+++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 14.00
+# Visual Studio 2015
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.vcxproj
+++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2015.vcxproj
@ -0,0 +1,107 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>conjugateGradientCudaGraphs_vs2015</RootNamespace>
+    <ProjectName>conjugateGradientCudaGraphs</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v140</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/conjugateGradientCudaGraphs.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="conjugateGradientCudaGraphs.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.sln
+++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2017
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj
+++ b/Samples/conjugateGradientCudaGraphs/conjugateGradientCudaGraphs_vs2017.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>conjugateGradientCudaGraphs_vs2017</RootNamespace>
+    <ProjectName>conjugateGradientCudaGraphs</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v141</PlatformToolset>
+    <WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/conjugateGradientCudaGraphs.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="conjugateGradientCudaGraphs.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/conjugateGradientMultiBlockCG/Makefile
+++ b/Samples/conjugateGradientMultiBlockCG/Makefile
@ -1,31 +1,29 @@
 ################################################################################
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
-# Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
 #
-# NOTICE TO USER:
-#
-# This source code is subject to NVIDIA ownership rights under U.S. and
-# international Copyright laws.
-#
-# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-# OR PERFORMANCE OF THIS SOURCE CODE.
-#
-# U.S. Government End Users.  This source code is a "commercial item" as
-# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
-# "commercial computer software" and "commercial computer software
-# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-# and is provided to the U.S. Government only as a commercial end item.
-# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-# source code with only those rights set forth herein.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 ################################################################################
 #
@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
        else ifeq ($(TARGET_OS), android)
-            HOST_COMPILER ?= aarch64-linux-android-g++
+            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
@ -266,7 +264,7 @@ LIBRARIES :=
 ################################################################################

 # Gencode arguments
-SMS ?= 60 61 70
+SMS ?= 60 61 70 75

 ifeq ($(SMS),)
 $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
--- a/Samples/conjugateGradientMultiBlockCG/NsightEclipse.xml
+++ b/Samples/conjugateGradientMultiBlockCG/NsightEclipse.xml
@ -42,6 +42,7 @@
  <sm-arch>sm60</sm-arch>
  <sm-arch>sm61</sm-arch>
  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm75</sm-arch>
  <supported_envs>
    <env>
      <arch>x86_64</arch>
--- a/Samples/conjugateGradientMultiBlockCG/README.md
+++ b/Samples/conjugateGradientMultiBlockCG/README.md
@ -10,7 +10,7 @@ Unified Memory, Linear Algebra, Cooperative Groups, MultiBlock Cooperative Group

 ## Supported SM Architectures

-[SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)
+[SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)

 ## Supported OSes

@ -27,7 +27,7 @@ x86_64, ppc64le

 ## Prerequisites

-Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

 ## Build and Run
--- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2012.vcxproj
+++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2012.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/conjugateGradientMultiBlockCG.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -103,6 +103,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2013.vcxproj
+++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2013.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/conjugateGradientMultiBlockCG.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -103,6 +103,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2015.vcxproj
+++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2015.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/conjugateGradientMultiBlockCG.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -103,6 +103,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj
+++ b/Samples/conjugateGradientMultiBlockCG/conjugateGradientMultiBlockCG_vs2017.vcxproj
@ -34,7 +34,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -63,7 +63,7 @@
      <OutputFile>$(OutDir)/conjugateGradientMultiBlockCG.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -104,6 +104,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/conjugateGradientMultiDeviceCG/Makefile
+++ b/Samples/conjugateGradientMultiDeviceCG/Makefile
@ -1,31 +1,29 @@
 ################################################################################
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
-# Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
 #
-# NOTICE TO USER:
-#
-# This source code is subject to NVIDIA ownership rights under U.S. and
-# international Copyright laws.
-#
-# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-# OR PERFORMANCE OF THIS SOURCE CODE.
-#
-# U.S. Government End Users.  This source code is a "commercial item" as
-# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
-# "commercial computer software" and "commercial computer software
-# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-# and is provided to the U.S. Government only as a commercial end item.
-# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-# source code with only those rights set forth herein.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 ################################################################################
 #
@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
        else ifeq ($(TARGET_OS), android)
-            HOST_COMPILER ?= aarch64-linux-android-g++
+            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
@ -266,7 +264,7 @@ LIBRARIES :=
 ################################################################################

 # Gencode arguments
-SMS ?= 60 61 70
+SMS ?= 60 61 70 75

 ifeq ($(SMS),)
 $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
--- a/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml
+++ b/Samples/conjugateGradientMultiDeviceCG/NsightEclipse.xml
@ -49,6 +49,7 @@
  <sm-arch>sm60</sm-arch>
  <sm-arch>sm61</sm-arch>
  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm75</sm-arch>
  <supported_envs>
    <env>
      <arch>x86_64</arch>
@ -58,6 +59,9 @@
      <arch>ppc64le</arch>
      <platform>linux</platform>
    </env>
+    <env>
+      <platform>windows</platform>
+    </env>
  </supported_envs>
  <supported_sm_architectures>
    <from>6.0</from>
--- a/Samples/conjugateGradientMultiDeviceCG/README.md
+++ b/Samples/conjugateGradientMultiDeviceCG/README.md
@ -10,11 +10,11 @@ Unified Memory, Linear Algebra, Cooperative Groups, MultiDevice Cooperative Grou

 ## Supported SM Architectures

-[SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)
+[SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)

 ## Supported OSes

-Linux
+Linux, Windows

 ## Supported CPU Architecture

@ -30,11 +30,21 @@ cudaMemAdvise, cudaMemPrefetchAsync, cudaLaunchCooperativeKernelMultiDevice, cud

 ## Prerequisites

-Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## Build and Run

+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
+
 ### Linux
 The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
 ```
--- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu
+++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG.cu
@ -414,7 +414,8 @@ void getIdenticalGPUs(int num_of_gpus, std::set<int> &identicalGPUs) {
        deviceProp.minor != maxMajorMinor[1]) {
      identicalGPUs.erase(it);
    }
-    if (!deviceProp.cooperativeMultiDeviceLaunch) {
+    if (!deviceProp.cooperativeMultiDeviceLaunch ||
+        !deviceProp.concurrentManagedAccess) {
      identicalGPUs.erase(it);
    }
    it++;
@ -449,7 +450,8 @@ int main(int argc, char **argv) {
  if (identicalGPUs.size() <= 1) {
    printf(
        "No Two or more GPUs with same architecture capable of "
-        "cooperativeMultiDeviceLaunch found. \nWaiving the sample\n");
+        "cooperativeMultiDeviceLaunch & concurrentManagedAccess found. "
+        "\nWaiving the sample\n");
    exit(EXIT_WAIVED);
  }

@ -617,9 +619,12 @@ int main(int argc, char **argv) {
      cudaCooperativeLaunchMultiDeviceNoPreSync |
          cudaCooperativeLaunchMultiDeviceNoPostSync));

-  checkCudaErrors(cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId));
-  checkCudaErrors(
-      cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId));
+  if (deviceProp.concurrentManagedAccess) {
+    checkCudaErrors(
+        cudaMemPrefetchAsync(x, sizeof(float) * N, cudaCpuDeviceId));
+    checkCudaErrors(
+        cudaMemPrefetchAsync(dot_result, sizeof(double), cudaCpuDeviceId));
+  }

  deviceId = identicalGPUs.begin();
  device_count = 0;
--- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.sln
+++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientMultiDeviceCG", "conjugateGradientMultiDeviceCG_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.vcxproj
+++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2012.vcxproj
@ -15,14 +15,15 @@
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
-    <RootNamespace>conjugateGradientMultiBlockCG_vs2010</RootNamespace>
-    <ProjectName>conjugateGradientMultiBlockCG</ProjectName>
+    <RootNamespace>conjugateGradientMultiDeviceCG_vs2012</RootNamespace>
+    <ProjectName>conjugateGradientMultiDeviceCG</ProjectName>
    <CudaToolkitCustomDir />
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup>
    <ConfigurationType>Application</ConfigurationType>
    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v110</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
    <UseDebugLibraries>true</UseDebugLibraries>
@ -32,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -58,10 +59,10 @@
      <SubSystem>Console</SubSystem>
      <AdditionalDependencies>cudadevrt.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
-      <OutputFile>$(OutDir)/conjugateGradientMultiBlockCG.exe</OutputFile>
+      <OutputFile>$(OutDir)/conjugateGradientMultiDeviceCG.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -97,11 +98,11 @@
    </CudaCompile>
  </ItemDefinitionGroup>
  <ItemGroup>
-    <CudaCompile Include="conjugateGradientMultiBlockCG.cu" />
+    <CudaCompile Include="conjugateGradientMultiDeviceCG.cu" />

  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.sln
+++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 13.00
+# Visual Studio 2013
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientMultiDeviceCG", "conjugateGradientMultiDeviceCG_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.vcxproj
+++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2013.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>conjugateGradientMultiDeviceCG_vs2013</RootNamespace>
+    <ProjectName>conjugateGradientMultiDeviceCG</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudadevrt.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/conjugateGradientMultiDeviceCG.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+      <GenerateRelocatableDeviceCode>true</GenerateRelocatableDeviceCode>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="conjugateGradientMultiDeviceCG.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.sln
+++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 14.00
+# Visual Studio 2015
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientMultiDeviceCG", "conjugateGradientMultiDeviceCG_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.vcxproj
+++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2015.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>conjugateGradientMultiDeviceCG_vs2015</RootNamespace>
+    <ProjectName>conjugateGradientMultiDeviceCG</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v140</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudadevrt.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/conjugateGradientMultiDeviceCG.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+      <GenerateRelocatableDeviceCode>true</GenerateRelocatableDeviceCode>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="conjugateGradientMultiDeviceCG.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.sln
+++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2017
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientMultiDeviceCG", "conjugateGradientMultiDeviceCG_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj
+++ b/Samples/conjugateGradientMultiDeviceCG/conjugateGradientMultiDeviceCG_vs2017.vcxproj
@ -0,0 +1,109 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>conjugateGradientMultiDeviceCG_vs2017</RootNamespace>
+    <ProjectName>conjugateGradientMultiDeviceCG</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v141</PlatformToolset>
+	<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudadevrt.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/conjugateGradientMultiDeviceCG.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+      <GenerateRelocatableDeviceCode>true</GenerateRelocatableDeviceCode>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="conjugateGradientMultiDeviceCG.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/cudaTensorCoreGemm/Makefile
+++ b/Samples/cudaTensorCoreGemm/Makefile
@ -1,31 +1,29 @@
 ################################################################################
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
-# Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
 #
-# NOTICE TO USER:
-#
-# This source code is subject to NVIDIA ownership rights under U.S. and
-# international Copyright laws.
-#
-# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-# OR PERFORMANCE OF THIS SOURCE CODE.
-#
-# U.S. Government End Users.  This source code is a "commercial item" as
-# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
-# "commercial computer software" and "commercial computer software
-# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-# and is provided to the U.S. Government only as a commercial end item.
-# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-# source code with only those rights set forth herein.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 ################################################################################
 #
@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
        else ifeq ($(TARGET_OS), android)
-            HOST_COMPILER ?= aarch64-linux-android-g++
+            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
@ -266,7 +264,7 @@ LIBRARIES :=
 ################################################################################

 # Gencode arguments
-SMS ?= 70
+SMS ?= 70 75

 ifeq ($(SMS),)
 $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
--- a/Samples/cudaTensorCoreGemm/NsightEclipse.xml
+++ b/Samples/cudaTensorCoreGemm/NsightEclipse.xml
@ -43,6 +43,7 @@ In addition to that, it demonstrates the use of the new CUDA function attribute
    <scope>1:CUDA Basic Topics</scope>
  </scopes>
  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm75</sm-arch>
  <supported_envs>
    <env>
      <arch>x86_64</arch>
--- a/Samples/cudaTensorCoreGemm/README.md
+++ b/Samples/cudaTensorCoreGemm/README.md
@ -14,7 +14,7 @@ Matrix Multiply, WMMA, Tensor Cores

 ## Supported SM Architectures

-[SM 7.0 ](https://developer.nvidia.com/cuda-gpus)
+[SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)

 ## Supported OSes

@ -31,7 +31,7 @@ cudaMallocManaged, cudaDeviceSynchronize, cudaFuncSetAttribute, cudaEventCreate,

 ## Prerequisites

-Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## Build and Run

--- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu
+++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm.cu
@ -72,6 +72,21 @@
 #include <helper_cuda.h>
 #include <helper_functions.h>

+// Externally configurable parameters.
+
+#ifndef CPU_DEBUG
+// Set this to 1 to verify the correctness of the GPU-computed matrix.
+#define CPU_DEBUG 0
+#endif
+
+#ifndef SHARED_MEMORY_LIMIT_64K
+// Set this to 0 to use more than 64 Kb of shared memory to cache data, to
+// improve the performance of the computations on GPU.
+// Note that you need a GPU that can have more than 64 Kb of shared memory
+// per multiprocessor.
+#define SHARED_MEMORY_LIMIT_64K 1
+#endif
+
 // GPU configuration.

 #define WARP_SIZE 32
@ -82,6 +97,10 @@
 #define N 16
 #define K 16

+// Dimensions (M x N x K) of the WMMA fragments declared by the
+// simple_wmma_gemm kernel; each warp of that kernel works on tiles of this
+// size.
+#define WMMA_M 16
+#define WMMA_N 16
+#define WMMA_K 16
+
 // GEMM configuration.

 #define M_TILES 256
@ -99,7 +118,24 @@
 #define WARPS_PER_BLOCK 8
 #define THREADS_PER_BLOCK (WARP_SIZE * WARPS_PER_BLOCK)

+#if SHARED_MEMORY_LIMIT_64K
+// With only 64 Kb shared memory available, we can fit two 8-tile chunks of
+// the A and B matrix data, that are 16 * 16 * 8 * 8 * 2 = 32 Kb each
+// (i.e. two 8x8 arrays of tiles of 16x16 half-typed elements per CTA).
+// But we cannot account for the 8 Kb total skew overhead, without which the
+// performance would be severely impacted. So we choose to reduce the chunk size
+// in half, i.e. the amount of A and B matrix data we cache in shared memory.
+// Accordingly, this doubles the number of outer iterations across the global K
+// dimension, which only slightly impacts the performance.
+#define CHUNK_K 4
+#else
 #define CHUNK_K 8
+#endif
+
+// Size in bytes of one line of a cached chunk: CHUNK_K tiles of K half-typed
+// elements each.
+#define CHUNK_LINE_BYTES (CHUNK_K * K * sizeof(half))
+// Bytes moved by one warp per copy step when every lane copies a single int4.
+#define WARP_COPY_BYTES (WARP_SIZE * sizeof(int4))
+// Number of chunk lines one warp covers in a single copy step.
+#define CHUNK_COPY_LINES_PER_WARP (WARP_COPY_BYTES / CHUNK_LINE_BYTES)
+// Number of lanes of a warp that cooperate on copying one chunk line.
+#define CHUNK_COPY_LINE_LANES (WARP_SIZE / CHUNK_COPY_LINES_PER_WARP)

 #define BLOCK_ROW_WARPS 2
 #define BLOCK_COL_WARPS 4
@ -194,14 +230,14 @@ __global__ void compute_gemm(const half *A, const half *B, const float *C,
  const size_t shmem_idx_b_off = BLOCK_COL_TILES * M;

  // This pointer is used to access the C and D matrix tiles this warp computes.
-  float *shmem_warp_tile_ptr = reinterpret_cast<float *>(
-      &shmem[0][0] + (warpId / 2) * SHMEM_STRIDE * K * 2 +
-      (warpId % 2) * SHMEM_OFFSET);
+  float *shmem_warp_tile_ptr = (float *)&shmem[0][0] +
+                               (warpId / 2) * SHMEM_STRIDE * K * 2 +
+                               (warpId % 2) * SHMEM_OFFSET;

  // This pointer is used to stream the C and D matrices block-wide tile to and
  // from shared memory.
  float *shmem_warp_stream_ptr =
-      reinterpret_cast<float *>(&shmem[0][0] + warpId * SHMEM_STRIDE * K);
+      (float *)&shmem[0][0] + warpId * SHMEM_STRIDE * K;

  // Adjust the beta scaler, as it'll be multiplied by alpha at the end of
  // each tile computation. Technically this is not generally correct (may
@ -292,23 +328,24 @@ __global__ void compute_gemm(const half *A, const half *B, const float *C,
      // First half of the warp copies the first row / column of the matrix,
      // the second half of the warp copies the next.
      int4 *lane_ptr = (int4 *)(warp_ptr + tile_k * K +
-                                (laneId / (WARP_SIZE / 2)) * K_GLOBAL) +
-                       (laneId % (WARP_SIZE / 2));
+                                (laneId / CHUNK_COPY_LINE_LANES) * K_GLOBAL) +
+                       (laneId % CHUNK_COPY_LINE_LANES);

      // Shift the second half of the warp to the next row / column in the
      // shared memory.
-      shmem_idx += laneId / (WARP_SIZE / 2);
+      shmem_idx += laneId / CHUNK_COPY_LINE_LANES;

 #pragma unroll
-      for (int i = 0; i < (WARP_SIZE / 2); i++) {
+      for (int i = 0; i < ((WARP_SIZE / 2) / CHUNK_COPY_LINES_PER_WARP) * 2;
+           i++) {
        // Copy 16 bytes at once in each lane.
-        *((int4 *)&shmem[shmem_idx][0] + (laneId % (WARP_SIZE / 2))) =
+        *((int4 *)&shmem[shmem_idx][0] + (laneId % CHUNK_COPY_LINE_LANES)) =
            *lane_ptr;

        // Advance the global memory pointer and the shared memory index.
-        lane_ptr = reinterpret_cast<int4 *>(
-            reinterpret_cast<half *>(lane_ptr + K_GLOBAL * 2));
-        shmem_idx += 2;
+        lane_ptr =
+            (int4 *)((half *)lane_ptr + K_GLOBAL * CHUNK_COPY_LINES_PER_WARP);
+        shmem_idx += CHUNK_COPY_LINES_PER_WARP;
      }

      __syncthreads();
@ -374,17 +411,98 @@ __global__ void compute_gemm(const half *A, const half *B, const float *C,

 #pragma unroll
    for (int i = 0; i < K; i++) {
-      *(reinterpret_cast<int4 *>(dst_gmem_warp_stream_ptr +
-                                 GLOBAL_MEM_STRIDE * i) +
-        laneId) =
-          *(reinterpret_cast<int4 *>(shmem_warp_stream_ptr + SHMEM_STRIDE * i) +
-            laneId);
+      *((int4 *)(dst_gmem_warp_stream_ptr + GLOBAL_MEM_STRIDE * i) + laneId) =
+          *((int4 *)(shmem_warp_stream_ptr + SHMEM_STRIDE * i) + laneId);
    }

    __syncthreads();
  }
 }

+// Performs an MxNxK GEMM (D = alpha * A * B + beta * C) assuming:
+//  1) Matrices are packed in memory.
+//  2) M, N and K are multiples of 16.
+//  3) Neither A nor B are transposed.
+// Note: This is a less performant version of the compute_gemm kernel. It is
+//       designed for demonstration purposes only to show the CUDA WMMA API
+//       use without relying on availability of the shared memory.
+//
+// Parameters:
+//   a            - M x K matrix of half, read row-major (row stride k_ld).
+//   b            - K x N matrix of half, read column-major (column stride
+//                  k_ld).
+//   c            - M x N input matrix of float, row-major (row stride n_ld).
+//   d            - M x N output matrix of float, row-major (row stride n_ld).
+//   m_ld, n_ld,
+//   k_ld         - the M, N and K extents of the problem, in elements.
+//   alpha, beta  - scalars applied to A * B and to C respectively.
+//
+// Launch layout: every warp produces one WMMA_M x WMMA_N tile of D. Warps are
+// enumerated along x (tile row) and threads along y (tile column), so
+// blockDim.x must be a multiple of warpSize; otherwise the lanes of a warp
+// would disagree on which tile they own.
+__global__ void simple_wmma_gemm(half *a, half *b, float *c, float *d, int m_ld,
+                                 int n_ld, int k_ld, float alpha, float beta) {
+  // Leading dimensions. Packed with no transpositions.
+  // A is row-major M x K, so consecutive rows are K elements apart.
+  const int lda = k_ld;
+  // B is column-major K x N, so consecutive columns are K elements apart.
+  const int ldb = k_ld;
+  // C and D are row-major M x N.
+  const int ldc = n_ld;
+
+  // Tile using a 2D grid
+  const int warpM = (blockIdx.x * blockDim.x + threadIdx.x) / warpSize;
+  const int warpN = (blockIdx.y * blockDim.y + threadIdx.y);
+
+  // Origin of the output tile owned by this warp. It does not depend on the
+  // k iteration, so it is computed once.
+  const int cRow = warpM * WMMA_M;
+  const int cCol = warpN * WMMA_N;
+
+  // Bounds checking. The whole warp shares cRow / cCol, so all of its lanes
+  // leave together and the warp-wide *_sync calls below stay convergent.
+  if (cRow >= m_ld || cCol >= n_ld) {
+    return;
+  }
+
+  // Declare the fragments
+  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major>
+      a_frag;
+  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major>
+      b_frag;
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> acc_frag;
+  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> c_frag;
+
+  wmma::fill_fragment(acc_frag, 0.0f);
+
+  // Base of this warp's row band of A and column band of B. Offsets are
+  // computed in size_t so large matrices do not overflow 32-bit arithmetic.
+  const half *a_band = a + (size_t)cRow * lda;
+  const half *b_band = b + (size_t)cCol * ldb;
+
+  // Loop over k. The loop condition already keeps the k offset inside K.
+  for (int i = 0; i < k_ld; i += WMMA_K) {
+    // Load the inputs
+    wmma::load_matrix_sync(a_frag, a_band + i, lda);
+    wmma::load_matrix_sync(b_frag, b_band + i, ldb);
+
+    // Perform the matrix multiplication
+    wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
+  }
+
+  // Load in the current value of c, scale it by beta, and add this to our
+  // result scaled by alpha
+  const size_t c_offset = (size_t)cRow * ldc + cCol;
+
+  wmma::load_matrix_sync(c_frag, c + c_offset, ldc, wmma::mem_row_major);
+
+  for (int i = 0; i < c_frag.num_elements; i++) {
+    c_frag.x[i] = alpha * acc_frag.x[i] + beta * c_frag.x[i];
+  }
+
+  // Store the output
+  wmma::store_matrix_sync(d + c_offset, c_frag, ldc, wmma::mem_row_major);
+}
+
+// Host-side reference GEMM, computed in place: C = alpha * (A * B) + beta * C.
+//   A is indexed as A[row * numAColumns + k].
+//   B is indexed as B[col * numBRows + k], i.e. the operands of one output
+//   column are contiguous.
+//   C is indexed as C[row * numCColumns + col] and is both read and written.
+// numARows and numBColumns are accepted but not read by this routine.
+__host__ void matMultiplyOnHost(float *A, float *B, float *C, float alpha,
+                                float beta, int numARows, int numAColumns,
+                                int numBRows, int numBColumns, int numCRows,
+                                int numCColumns) {
+  for (int row = 0; row < numCRows; ++row) {
+    for (int col = 0; col < numCColumns; ++col) {
+      // Start offsets of the two vectors whose dot product is being formed.
+      const int aBase = row * numAColumns;
+      const int bBase = col * numBRows;
+      const int cIndex = row * numCColumns + col;
+
+      float dot = 0.0f;
+      int k = 0;
+      while (k < numAColumns) {
+        dot += A[aBase + k] * B[bBase + k];
+        ++k;
+      }
+
+      C[cIndex] = dot * alpha + beta * C[cIndex];
+    }
+  }
+}
+
 int main(int argc, char **argv) {
  printf("Initializing...\n");

@ -408,6 +526,10 @@ int main(int argc, char **argv) {
  float *A_h = NULL;
  float *B_h = NULL;
  float *C_h = NULL;
+#if CPU_DEBUG
+  float *result_hD = NULL;
+  float *result_host = NULL;
+#endif

  checkCudaErrors(cudaMallocManaged(reinterpret_cast<void **>(&A_h),
                                    sizeof(float) * M_GLOBAL * K_GLOBAL));
@ -415,6 +537,12 @@ int main(int argc, char **argv) {
                                    sizeof(float) * K_GLOBAL * N_GLOBAL));
  checkCudaErrors(cudaMallocManaged(reinterpret_cast<void **>(&C_h),
                                    sizeof(float) * M_GLOBAL * N_GLOBAL));
+#if CPU_DEBUG
+  checkCudaErrors(cudaMallocManaged((void **)&result_hD,
+                                    sizeof(float) * M_GLOBAL * N_GLOBAL));
+  checkCudaErrors(cudaMallocManaged((void **)&result_host,
+                                    sizeof(float) * M_GLOBAL * N_GLOBAL));
+#endif

  half *A = NULL;
  half *B = NULL;
@ -446,16 +574,22 @@ int main(int argc, char **argv) {
  checkCudaErrors(cudaDeviceSynchronize());

  enum {
-    SHMEM_SZ =
-        sizeof(half) * (BLOCK_COL_TILES * M) * (CHUNK_K * K + SKEW_HALF) * 2
+    // Compute the right amount of shared memory to request.
+    // We need shared memory to hold per-CTA C and D matrix tiles, and to cache
+    // per-CTA chunks
+    // of the A and B matrices. Therefore, the right amount to request is the
+    // maximum of those
+    // two numbers.
+    SHMEM_SZ = MAX(
+        sizeof(half) * (BLOCK_COL_TILES * M) * (CHUNK_K * K + SKEW_HALF) * 2,
+        M * (BLOCK_ROW_WARPS * WARP_ROW_TILES) * N *
+            (BLOCK_COL_WARPS * WARP_COL_TILES) * sizeof(float))
  };

  printf("Required shared memory size: %lu Kb\n", SHMEM_SZ / 1024UL);

-  checkCudaErrors(cudaFuncSetAttribute(
-      compute_gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ));
-
-  printf("Computing...\n");
+  const float alpha = 1.1f;
+  const float beta = 1.2f;

  cudaEvent_t start, stop;

@ -463,16 +597,61 @@ int main(int argc, char **argv) {
  checkCudaErrors(cudaEventCreate(&stop));
  checkCudaErrors(cudaEventRecord(start));

-  const float alpha = 1.1f;
-  const float beta = 1.2f;
+  // If the GPU has enough shared memory, use the high-performance kernel.
+  if (deviceProp.sharedMemPerMultiprocessor >= SHMEM_SZ) {
+    printf("Computing... using high performance kernel compute_gemm \n");

-  checkKernelErrors(
-      (compute_gemm<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK,
-                      SHMEM_SZ>>>(A, B, C, D, alpha, beta)));
+    checkCudaErrors(cudaFuncSetAttribute(
+        compute_gemm, cudaFuncAttributeMaxDynamicSharedMemorySize, SHMEM_SZ));
+    checkKernelErrors(
+        (compute_gemm<<<deviceProp.multiProcessorCount, THREADS_PER_BLOCK,
+                        SHMEM_SZ>>>(A, B, C, D, alpha, beta)));
+#if CPU_DEBUG
+    checkCudaErrors(cudaMemcpy(result_hD, D,
+                               sizeof(float) * M_GLOBAL * N_GLOBAL,
+                               cudaMemcpyDeviceToHost));
+#endif
+  } else {
+    dim3 gridDim;
+    dim3 blockDim;
+
+    // blockDim.x must be a multiple of warpSize
+    // 128x4 means we have 16 warps and a block computes a 64x64 output tile
+    blockDim.x = 128;
+    blockDim.y = 4;
+
+    gridDim.x = (M_GLOBAL + (WMMA_M * blockDim.x / 32 - 1)) /
+                (WMMA_M * blockDim.x / 32);
+    gridDim.y = (N_GLOBAL + WMMA_N * blockDim.y - 1) / (WMMA_N * blockDim.y);
+
+    printf("Computing... using simple_wmma_gemm kernel\n");
+    simple_wmma_gemm<<<gridDim, blockDim>>>(A, B, C, D, M_GLOBAL, N_GLOBAL,
+                                            K_GLOBAL, alpha, beta);
+#if CPU_DEBUG
+    checkCudaErrors(cudaMemcpy(result_hD, D,
+                               sizeof(float) * M_GLOBAL * N_GLOBAL,
+                               cudaMemcpyDeviceToHost));
+#endif
+  }

  checkCudaErrors(cudaEventRecord(stop));
  checkCudaErrors(cudaEventSynchronize(stop));

+#if CPU_DEBUG
+  printf("Verifying correctness of the computations...\n");
+
+  memcpy(result_host, C_h, sizeof(float) * M_GLOBAL * N_GLOBAL);
+
+  matMultiplyOnHost(A_h, B_h, result_host, alpha, beta, M_GLOBAL, K_GLOBAL,
+                    K_GLOBAL, N_GLOBAL, M_GLOBAL, N_GLOBAL);
+
+  for (int i = 0; i < N_GLOBAL * M_GLOBAL; i++) {
+    if (fabs(result_hD[i] - result_host[i]) > 0.1f)
+      printf("mismatch i=%d result_hD=%f result_host=%f\n", i, result_hD[i],
+             result_host[i]);
+  }
+#endif
+
  float milliseconds = 0;

  checkCudaErrors(cudaEventElapsedTime(&milliseconds, start, stop));
--- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2012.vcxproj
+++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2012.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/cudaTensorCoreGemm.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -102,6 +102,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2013.vcxproj
+++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2013.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/cudaTensorCoreGemm.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -102,6 +102,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2015.vcxproj
+++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2015.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/cudaTensorCoreGemm.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -102,6 +102,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj
+++ b/Samples/cudaTensorCoreGemm/cudaTensorCoreGemm_vs2017.vcxproj
@ -34,7 +34,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -63,7 +63,7 @@
      <OutputFile>$(OutDir)/cudaTensorCoreGemm.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -103,6 +103,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/deviceQuery/Makefile
+++ b/Samples/deviceQuery/Makefile
@ -1,31 +1,29 @@
 ################################################################################
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
-# Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
 #
-# NOTICE TO USER:
-#
-# This source code is subject to NVIDIA ownership rights under U.S. and
-# international Copyright laws.
-#
-# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-# OR PERFORMANCE OF THIS SOURCE CODE.
-#
-# U.S. Government End Users.  This source code is a "commercial item" as
-# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
-# "commercial computer software" and "commercial computer software
-# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-# and is provided to the U.S. Government only as a commercial end item.
-# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-# source code with only those rights set forth herein.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 ################################################################################
 #
@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
        else ifeq ($(TARGET_OS), android)
-            HOST_COMPILER ?= aarch64-linux-android-g++
+            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
@ -248,7 +246,7 @@ LIBRARIES :=
 ################################################################################

 # Gencode arguments
-SMS ?= 30 35 37 50 52 60 61 70
+SMS ?= 30 35 37 50 52 60 61 70 75

 ifeq ($(SMS),)
 $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
--- a/Samples/deviceQuery/NsightEclipse.xml
+++ b/Samples/deviceQuery/NsightEclipse.xml
@ -39,6 +39,7 @@
  <sm-arch>sm60</sm-arch>
  <sm-arch>sm61</sm-arch>
  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm75</sm-arch>
  <supported_envs>
    <env>
      <arch>x86_64</arch>
--- a/Samples/deviceQuery/README.md
+++ b/Samples/deviceQuery/README.md
@ -10,7 +10,7 @@ CUDA Runtime API, Device Query

 ## Supported SM Architectures

-[SM 3.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)
+[SM 3.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)

 ## Supported OSes

@ -27,7 +27,7 @@ cudaSetDevice, cudaGetDeviceCount, cudaGetDeviceProperties, cudaDriverGetVersion

 ## Prerequisites

-Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## Build and Run

--- a/Samples/deviceQuery/deviceQuery_vs2012.vcxproj
+++ b/Samples/deviceQuery/deviceQuery_vs2012.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/deviceQuery.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -102,6 +102,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/deviceQuery/deviceQuery_vs2013.vcxproj
+++ b/Samples/deviceQuery/deviceQuery_vs2013.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/deviceQuery.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -102,6 +102,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/deviceQuery/deviceQuery_vs2015.vcxproj
+++ b/Samples/deviceQuery/deviceQuery_vs2015.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/deviceQuery.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -102,6 +102,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/deviceQuery/deviceQuery_vs2017.vcxproj
+++ b/Samples/deviceQuery/deviceQuery_vs2017.vcxproj
@ -34,7 +34,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -63,7 +63,7 @@
      <OutputFile>$(OutDir)/deviceQuery.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -103,6 +103,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/matrixMul/Makefile
+++ b/Samples/matrixMul/Makefile
@ -1,31 +1,29 @@
 ################################################################################
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
-# Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
 #
-# NOTICE TO USER:
-#
-# This source code is subject to NVIDIA ownership rights under U.S. and
-# international Copyright laws.
-#
-# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-# OR PERFORMANCE OF THIS SOURCE CODE.
-#
-# U.S. Government End Users.  This source code is a "commercial item" as
-# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
-# "commercial computer software" and "commercial computer software
-# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-# and is provided to the U.S. Government only as a commercial end item.
-# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-# source code with only those rights set forth herein.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 ################################################################################
 #
@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
        else ifeq ($(TARGET_OS), android)
-            HOST_COMPILER ?= aarch64-linux-android-g++
+            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
@ -248,7 +246,7 @@ LIBRARIES :=
 ################################################################################

 # Gencode arguments
-SMS ?= 30 35 37 50 52 60 61 70
+SMS ?= 30 35 37 50 52 60 61 70 75

 ifeq ($(SMS),)
 $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
--- a/Samples/matrixMul/NsightEclipse.xml
+++ b/Samples/matrixMul/NsightEclipse.xml
@ -46,6 +46,7 @@
  <sm-arch>sm60</sm-arch>
  <sm-arch>sm61</sm-arch>
  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm75</sm-arch>
  <supported_envs>
    <env>
      <arch>x86_64</arch>
--- a/Samples/matrixMul/README.md
+++ b/Samples/matrixMul/README.md
@ -10,7 +10,7 @@ CUDA Runtime API, Linear Algebra

 ## Supported SM Architectures

-[SM 3.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)
+[SM 3.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)

 ## Supported OSes

@ -27,7 +27,7 @@ cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventEla

 ## Prerequisites

-Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## Build and Run

--- a/Samples/matrixMul/matrixMul_vs2012.vcxproj
+++ b/Samples/matrixMul/matrixMul_vs2012.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/matrixMul.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -102,6 +102,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/matrixMul/matrixMul_vs2013.vcxproj
+++ b/Samples/matrixMul/matrixMul_vs2013.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/matrixMul.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -102,6 +102,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/matrixMul/matrixMul_vs2015.vcxproj
+++ b/Samples/matrixMul/matrixMul_vs2015.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/matrixMul.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -102,6 +102,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/matrixMul/matrixMul_vs2017.vcxproj
+++ b/Samples/matrixMul/matrixMul_vs2017.vcxproj
@ -34,7 +34,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -63,7 +63,7 @@
      <OutputFile>$(OutDir)/matrixMul.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -103,6 +103,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/matrixMulDrv/Makefile
+++ b/Samples/matrixMulDrv/Makefile
@ -1,31 +1,29 @@
 ################################################################################
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
-# Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
 #
-# NOTICE TO USER:
-#
-# This source code is subject to NVIDIA ownership rights under U.S. and
-# international Copyright laws.
-#
-# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-# OR PERFORMANCE OF THIS SOURCE CODE.
-#
-# U.S. Government End Users.  This source code is a "commercial item" as
-# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
-# "commercial computer software" and "commercial computer software
-# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-# and is provided to the U.S. Government only as a commercial end item.
-# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-# source code with only those rights set forth herein.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 ################################################################################
 #
@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
        else ifeq ($(TARGET_OS), android)
-            HOST_COMPILER ?= aarch64-linux-android-g++
+            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
--- a/Samples/matrixMulDrv/README.md
+++ b/Samples/matrixMulDrv/README.md
@ -10,7 +10,7 @@ CUDA Driver API, Matrix Multiply

 ## Supported SM Architectures

-[SM 3.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)
+[SM 3.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)

 ## Supported OSes

@ -27,7 +27,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu

 ## Prerequisites

-Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## Build and Run

--- a/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj
+++ b/Samples/matrixMulDrv/matrixMulDrv_vs2012.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -106,6 +106,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj
+++ b/Samples/matrixMulDrv/matrixMulDrv_vs2013.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -106,6 +106,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj
+++ b/Samples/matrixMulDrv/matrixMulDrv_vs2015.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -106,6 +106,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj
+++ b/Samples/matrixMulDrv/matrixMulDrv_vs2017.vcxproj
@ -34,7 +34,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -107,6 +107,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/p2pBandwidthLatencyTest/Makefile
+++ b/Samples/p2pBandwidthLatencyTest/Makefile
@ -0,0 +1,300 @@
+################################################################################
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#
+################################################################################
+#
+# Makefile project only supported on Mac OS X and Linux platforms
+#
+################################################################################
+
+# Location of the CUDA Toolkit
+CUDA_PATH ?= /usr/local/cuda
+
+##############################
+# start deprecated interface #
+##############################
+ifeq ($(x86_64),1)
+    $(info WARNING - x86_64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=x86_64 instead)
+    TARGET_ARCH ?= x86_64
+endif
+ifeq ($(ARMv7),1)
+    $(info WARNING - ARMv7 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=armv7l instead)
+    TARGET_ARCH ?= armv7l
+endif
+ifeq ($(aarch64),1)
+    $(info WARNING - aarch64 variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=aarch64 instead)
+    TARGET_ARCH ?= aarch64
+endif
+ifeq ($(ppc64le),1)
+    $(info WARNING - ppc64le variable has been deprecated)
+    $(info WARNING - please use TARGET_ARCH=ppc64le instead)
+    TARGET_ARCH ?= ppc64le
+endif
+ifneq ($(GCC),)
+    $(info WARNING - GCC variable has been deprecated)
+    $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
+    HOST_COMPILER ?= $(GCC)
+endif
+ifneq ($(abi),)
+    $(error ERROR - abi variable has been removed)
+endif
+############################
+# end deprecated interface #
+############################
+
+# architecture
+HOST_ARCH   := $(shell uname -m)
+TARGET_ARCH ?= $(HOST_ARCH)
+ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
+    ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+        ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
+            TARGET_SIZE := 64
+        else ifneq (,$(filter $(TARGET_ARCH),armv7l))
+            TARGET_SIZE := 32
+        endif
+    else
+        TARGET_SIZE := $(shell getconf LONG_BIT)
+    endif
+else
+    $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
+endif
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
+        $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
+    endif
+endif
+
+# On a native aarch64 system with a 32-bit userspace, change TARGET_ARCH to armv7l
+ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
+    TARGET_ARCH = armv7l
+endif
+
+# operating system
+HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
+TARGET_OS ?= $(HOST_OS)
+ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
+    $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
+endif
+
+# host compiler
+ifeq ($(TARGET_OS),darwin)
+    ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
+        HOST_COMPILER ?= clang++
+    endif
+else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
+        ifeq ($(TARGET_OS),linux)
+            HOST_COMPILER ?= arm-linux-gnueabihf-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
+        else ifeq ($(TARGET_OS),android)
+            HOST_COMPILER ?= arm-linux-androideabi-g++
+        endif
+    else ifeq ($(TARGET_ARCH),aarch64)
+        ifeq ($(TARGET_OS), linux)
+            HOST_COMPILER ?= aarch64-linux-gnu-g++
+        else ifeq ($(TARGET_OS),qnx)
+            ifeq ($(QNX_HOST),)
+                $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
+            endif
+            ifeq ($(QNX_TARGET),)
+                $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
+            endif
+            export QNX_HOST
+            export QNX_TARGET
+            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
+        else ifeq ($(TARGET_OS), android)
+            HOST_COMPILER ?= aarch64-linux-android-clang++
+        endif
+    else ifeq ($(TARGET_ARCH),ppc64le)
+        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
+    endif
+endif
+HOST_COMPILER ?= g++
+NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
+
+# internal flags
+NVCCFLAGS   := -m${TARGET_SIZE}
+CCFLAGS     :=
+LDFLAGS     :=
+
+# build flags
+ifeq ($(TARGET_OS),darwin)
+    LDFLAGS += -rpath $(CUDA_PATH)/lib
+    CCFLAGS += -arch $(HOST_ARCH)
+else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
+    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
+    CCFLAGS += -mfloat-abi=hard
+else ifeq ($(TARGET_OS),android)
+    LDFLAGS += -pie
+    CCFLAGS += -fpie -fpic -fexceptions
+endif
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
+        endif
+    endif
+    ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+        ifneq ($(TARGET_FS),)
+            GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
+            ifeq ($(GCCVERSIONLTEQ46),1)
+                CCFLAGS += --sysroot=$(TARGET_FS)
+            endif
+            LDFLAGS += --sysroot=$(TARGET_FS)
+            LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
+            LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
+            LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include
+            CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
+        endif
+    endif
+endif
+
+ifeq ($(TARGET_OS),qnx)
+    CCFLAGS += -DWIN_INTERFACE_CUSTOM
+    LDFLAGS += -lsocket
+endif
+
+# CUDA install subdirectory for the selected target architecture and OS
+CUDA_INSTALL_TARGET_DIR :=
+ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
+    CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
+else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
+    CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
+else ifeq ($(TARGET_ARCH),ppc64le)
+    CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
+endif
+
+# Debug build flags
+ifeq ($(dbg),1)
+      NVCCFLAGS += -g -G
+      BUILD_TYPE := debug
+else
+      BUILD_TYPE := release
+endif
+
+ALL_CCFLAGS :=
+ALL_CCFLAGS += $(NVCCFLAGS)
+ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
+ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
+
+SAMPLE_ENABLED := 1
+
+ALL_LDFLAGS :=
+ALL_LDFLAGS += $(ALL_CCFLAGS)
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
+ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
+
+# Common includes and paths for CUDA
+INCLUDES  := -I../../Common
+LIBRARIES :=
+
+################################################################################
+
+# Gencode arguments
+SMS ?= 30 35 37 50 52 60 61 70 75
+
+ifeq ($(SMS),)
+$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
+SAMPLE_ENABLED := 0
+endif
+
+ifeq ($(GENCODE_FLAGS),)
+# Generate SASS code for each SM architecture listed in $(SMS)
+$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
+
+# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
+HIGHEST_SM := $(lastword $(sort $(SMS)))
+ifneq ($(HIGHEST_SM),)
+GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
+endif
+endif
+
+ifeq ($(SAMPLE_ENABLED),0)
+EXEC ?= @echo "[@]"
+endif
+
+################################################################################
+
+# Target rules
+all: build
+
+build: p2pBandwidthLatencyTest
+
+check.deps:
+ifeq ($(SAMPLE_ENABLED),0)
+	@echo "Sample will be waived due to the above missing dependencies"
+else
+	@echo "Sample is ready - all dependencies have been met"
+endif
+
+p2pBandwidthLatencyTest.o:p2pBandwidthLatencyTest.cu
+	$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
+
+p2pBandwidthLatencyTest: p2pBandwidthLatencyTest.o
+	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
+	$(EXEC) mkdir -p ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+	$(EXEC) cp $@ ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
+
+run: build
+	$(EXEC) ./p2pBandwidthLatencyTest
+
+clean:
+	rm -f p2pBandwidthLatencyTest p2pBandwidthLatencyTest.o
+	rm -rf ../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/p2pBandwidthLatencyTest
+
+clobber: clean
--- a/Samples/p2pBandwidthLatencyTest/NsightEclipse.xml
+++ b/Samples/p2pBandwidthLatencyTest/NsightEclipse.xml
@ -0,0 +1,77 @@
+<?xml version="1.0" encoding="UTF-8"?> 
+<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
+<entry>
+  <name>p2pBandwidthLatencyTest</name>
+  <cuda_api_list>
+    <toolkit>cudaDeviceCanAccessPeer</toolkit>
+    <toolkit>cudaDeviceEnablePeerAccess</toolkit>
+    <toolkit>cudaDeviceDisablePeerAccess</toolkit>
+    <toolkit>cudaEventCreateWithFlags</toolkit>
+    <toolkit>cudaEventElapsedTime</toolkit>
+    <toolkit>cudaMemcpy</toolkit>
+  </cuda_api_list>
+  <description><![CDATA[This application demonstrates the CUDA Peer-To-Peer (P2P) data transfers between pairs of GPUs and computes latency and bandwidth.  Tests on GPU pairs using P2P and without P2P are tested.]]></description>
+  <devicecompilation>whole</devicecompilation>
+  <includepaths>
+    <path>./</path>
+    <path>../</path>
+    <path>../../common/inc</path>
+  </includepaths>
+  <keyconcepts>
+    <concept level="basic">Performance Strategies</concept>
+    <concept level="basic">Asynchronous Data Transfers</concept>
+    <concept level="basic">Unified Virtual Address Space</concept>
+    <concept level="basic">Peer to Peer Data Transfers</concept>
+    <concept level="basic">Multi-GPU</concept>
+  </keyconcepts>
+  <keywords>
+    <keyword>CUDA</keyword>
+    <keyword>Performance</keyword>
+    <keyword>multi-GPU support</keyword>
+    <keyword>peer to peer</keyword>
+  </keywords>
+  <libraries>
+  </libraries>
+  <librarypaths>
+  </librarypaths>
+  <nsight_eclipse>true</nsight_eclipse>
+  <primary_file>p2pBandwidthLatencyTest.cu</primary_file>
+  <scopes>
+    <scope>1:CUDA Basic Topics</scope>
+    <scope>1:Performance Strategies</scope>
+  </scopes>
+  <sm-arch>sm30</sm-arch>
+  <sm-arch>sm35</sm-arch>
+  <sm-arch>sm37</sm-arch>
+  <sm-arch>sm50</sm-arch>
+  <sm-arch>sm52</sm-arch>
+  <sm-arch>sm60</sm-arch>
+  <sm-arch>sm61</sm-arch>
+  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm75</sm-arch>
+  <supported_envs>
+    <env>
+      <arch>x86_64</arch>
+      <platform>linux</platform>
+    </env>
+    <env>
+      <platform>windows7</platform>
+    </env>
+    <env>
+      <arch>x86_64</arch>
+      <platform>macosx</platform>
+    </env>
+    <env>
+      <arch>arm</arch>
+    </env>
+    <env>
+      <arch>ppc64le</arch>
+      <platform>linux</platform>
+    </env>
+  </supported_envs>
+  <supported_sm_architectures>
+    <include>all</include>
+  </supported_sm_architectures>
+  <title>Peer-to-Peer Bandwidth Latency Test with Multi-GPUs</title>
+  <type>exe</type>
+</entry>
--- a/Samples/p2pBandwidthLatencyTest/README.md
+++ b/Samples/p2pBandwidthLatencyTest/README.md
@ -0,0 +1,94 @@
+# p2pBandwidthLatencyTest - Peer-to-Peer Bandwidth Latency Test with Multi-GPUs
+
+## Description
+
+This application demonstrates CUDA Peer-To-Peer (P2P) data transfers between pairs of GPUs and computes latency and bandwidth. GPU pairs are tested both with P2P enabled and with P2P disabled.
+
+## Key Concepts
+
+Performance Strategies, Asynchronous Data Transfers, Unified Virtual Address Space, Peer to Peer Data Transfers, Multi-GPU
+
+## Supported SM Architectures
+
+[SM 3.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)
+
+## Supported OSes
+
+Linux, Windows, MacOSX
+
+## Supported CPU Architecture
+
+x86_64, ppc64le, armv7l
+
+## CUDA APIs involved
+
+### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
+cudaDeviceCanAccessPeer, cudaDeviceEnablePeerAccess, cudaDeviceDisablePeerAccess, cudaEventCreateWithFlags, cudaEventElapsedTime, cudaMemcpy
+
+## Prerequisites
+
+Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+
+## Build and Run
+
+### Windows
+The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
+```
+*_vs<version>.sln - for Visual Studio <version>
+```
+Each individual sample has its own set of solution files in its directory.
+
+To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details.
+
+### Linux
+The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+The samples makefiles can take advantage of certain options:
+*  **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
+    By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
+`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
+    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
+*   **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+*   **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
+    ```
+    $ make SMS="50 60"
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
+```
+    $ make HOST_COMPILER=g++
+```
+
+### Mac
+The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make:
+```
+$ cd <sample_dir>
+$ make
+```
+
+The samples makefiles can take advantage of certain options:
+
+*  **dbg=1** - build with debug symbols
+    ```
+    $ make dbg=1
+    ```
+
+*  **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60".
+    ```
+    $ make SMS="A B ..."
+    ```
+
+*  **HOST_COMPILER=<host_compiler>** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers.
+    ```
+    $ make HOST_COMPILER=clang
+    ```
+
+## References (for more details)
+
--- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu
+++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest.cu
@ -0,0 +1,682 @@
+/* Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <cstdio>
+#include <vector>
+
+#include <helper_cuda.h>
+#include <helper_timer.h>
+
+using namespace std;
+
+// Banner text printed by main() at start-up.
+const char *sSampleName = "P2P (Peer-to-Peer) GPU Bandwidth Latency Test";
+
+// Direction of the copy issued on device i's stream when measuring pair (i, j).
+typedef enum {
+  P2P_WRITE = 0,  // source buffer on device i, destination buffer on device j
+  P2P_READ = 1,   // source buffer on device j, destination buffer on device i
+} P2PDataTransfer;
+
+// Mechanism performP2PCopy() uses to move the data.
+typedef enum {
+  CE = 0,  // cudaMemcpyPeerAsync
+  SM = 1,  // copyp2p kernel; only taken when peer access is available
+} P2PEngine;
+
+// Selected mechanism; main() switches it to SM when --sm_copy is given.
+P2PEngine p2p_mechanism = CE;  // By default use Copy Engine
+
+// Macro for checking cuda errors following a cuda launch or api call.
+// Reads (and clears) the runtime's last error via cudaGetLastError(); on
+// failure it prints the file, line and error string and terminates the
+// process with EXIT_FAILURE.
+// The body is wrapped in do { } while (0) so that `cudaCheckError();` expands
+// to exactly one statement and remains safe inside unbraced if/else bodies.
+#define cudaCheckError()                                       \
+  do {                                                         \
+    cudaError_t e = cudaGetLastError();                        \
+    if (e != cudaSuccess) {                                    \
+      printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, \
+             cudaGetErrorString(e));                           \
+      exit(EXIT_FAILURE);                                      \
+    }                                                          \
+  } while (0)
+// Spin kernel used to hold a stream until the host has finished queuing work.
+// Returns as soon as *flag reads non-zero, or once more than timeout_clocks
+// ticks of clock64() have elapsed since entry, whichever happens first, so the
+// application can always make progress.
+__global__ void delay(volatile int *flag,
+                      unsigned long long timeout_clocks = 10000000) {
+  const long long int first_tick = clock64();
+
+  for (;;) {
+    // flag is volatile, so it is re-read from memory on every pass
+    if (*flag) {
+      return;
+    }
+
+    const long long int current_tick = clock64();
+    if (current_tick - first_tick > timeout_clocks) {
+      return;
+    }
+  }
+}
+
+// This kernel is for demonstration purposes only, not a performant kernel for
+// p2p transfers.
+//
+// Copies num_elems int4 values from src to dest. Each thread starts at its
+// global 1D index and advances by the total number of launched threads, so
+// any 1D launch configuration covers the whole range.
+__global__ void copyp2p(int4 *__restrict__ dest, int4 const *__restrict__ src,
+                        size_t num_elems) {
+  // Widen to size_t before multiplying: the built-in index variables are
+  // 32-bit unsigned, so the products would otherwise wrap before being stored.
+  size_t globalId = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+  size_t gridSize = (size_t)blockDim.x * gridDim.x;
+
+#pragma unroll(5)
+  for (size_t i = globalId; i < num_elems; i += gridSize) {
+    dest[i] = src[i];
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////
+// Print help screen
+//
+// Writes the usage line and the list of supported command line options
+// (--help, --p2p_read, --sm_copy) to stdout.
+///////////////////////////////////////////////////////////////////////////
+void printHelp(void) {
+  printf("Usage:  p2pBandwidthLatencyTest [OPTION]...\n");
+  printf("Tests bandwidth/latency of GPU pairs using P2P and without P2P\n");
+  printf("\n");
+
+  printf("Options:\n");
+  printf("--help\t\tDisplay this help menu\n");
+  printf(
+      "--p2p_read\tUse P2P reads for data transfers between GPU pairs and show "
+      "corresponding results.\n \t\tDefault used is P2P write operation.\n");
+  // Spelling of "initiated" corrected in the user-facing text below
+  printf("--sm_copy\tUse SM initiated p2p transfers instead of Copy Engine\n");
+}
+
+// Prints, for every ordered pair of distinct devices in [0, numGPUs), whether
+// the first device can access the second as a peer, then prints a note about
+// the fallback used when peer access is not available.
+// Any CUDA error terminates the process via cudaCheckError().
+void checkP2Paccess(int numGPUs) {
+  for (int dev = 0; dev < numGPUs; ++dev) {
+    cudaSetDevice(dev);
+    cudaCheckError();
+
+    for (int peer = 0; peer < numGPUs; ++peer) {
+      int canAccess;
+      if (dev == peer) {
+        continue;  // a device is not reported against itself
+      }
+
+      cudaDeviceCanAccessPeer(&canAccess, dev, peer);
+      cudaCheckError();
+
+      const char *verdict = canAccess ? "CAN" : "CANNOT";
+      printf("Device=%d %s Access Peer Device=%d\n", dev, verdict, peer);
+    }
+  }
+  printf(
+      "\n***NOTE: In case a device doesn't have P2P access to other one, it "
+      "falls back to normal memcopy procedure.\nSo you can see lesser "
+      "Bandwidth (GB/s) and unstable Latency (us) in those cases.\n\n");
+}
+
+// Enqueues `repeat` back-to-back copies of num_elems ints from src (located on
+// srcDevice) to dest (located on destDevice) into streamToRun. The call only
+// queues work; it does not synchronize the stream.
+//
+//   p2paccess - non-zero when peer access between the two devices is enabled
+//
+// When the global p2p_mechanism is SM and p2paccess is set, the copy is done by
+// the copyp2p kernel, which moves num_elems / 4 int4 values: num_elems should
+// be a multiple of 4, because any remainder is not copied. In every other case
+// cudaMemcpyPeerAsync copies exactly sizeof(int) * num_elems bytes.
+void performP2PCopy(int *dest, int destDevice, int *src, int srcDevice,
+                    int num_elems, int repeat, bool p2paccess,
+                    cudaStream_t streamToRun) {
+  if (p2p_mechanism == SM && p2paccess) {
+    // The launch configuration is only needed on the kernel path, so it is
+    // queried here instead of on every call; callers time this function on the
+    // host and the memcpy path should not pay for an unused query.
+    int blockSize = 0;
+    int numBlocks = 0;
+
+    cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, copyp2p);
+    cudaCheckError();
+
+    for (int r = 0; r < repeat; r++) {
+      copyp2p<<<numBlocks, blockSize, 0, streamToRun>>>(
+          (int4 *)dest, (int4 *)src, num_elems / 4);
+    }
+  } else {
+    for (int r = 0; r < repeat; r++) {
+      cudaMemcpyPeerAsync(dest, destDevice, src, srcDevice,
+                          sizeof(int) * num_elems, streamToRun);
+    }
+  }
+}
+
+// Measures and prints a numGPUs x numGPUs matrix of unidirectional copy
+// bandwidth in GB/s (1e9 bytes per second).
+//
+// Entry (i, j) times `repeat` copies of numElems ints between device i and
+// device j, using events recorded on device i's stream. The diagonal (i == j)
+// is a copy between two buffers on the same device and counts both the read
+// and the write.
+//
+//   numGPUs    - number of devices to test (devices 0 .. numGPUs - 1)
+//   p2p        - when true, peer access is enabled in both directions for each
+//                pair that reports it can access its peer, and disabled again
+//                after the measurement
+//   p2p_method - P2P_WRITE copies from device i's buffer to device j's buffer;
+//                any other value copies from device j's buffer to device i's
+//
+// CUDA errors detected by cudaCheckError() terminate the process.
+void outputBandwidthMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) {
+  int numElems = 10000000;
+  int repeat = 5;
+  volatile int *flag = NULL;
+  vector<int *> buffers(numGPUs);
+  vector<int *> buffersD2D(numGPUs);  // buffer for D2D, that is, intra-GPU copy
+  vector<cudaEvent_t> start(numGPUs);
+  vector<cudaEvent_t> stop(numGPUs);
+  vector<cudaStream_t> stream(numGPUs);
+
+  // Host-side flag polled by the delay kernel to hold back the queued work
+  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
+  cudaCheckError();
+
+  // Per-device resources: one stream, two buffers and a start/stop event pair
+  for (int d = 0; d < numGPUs; d++) {
+    cudaSetDevice(d);
+    cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking);
+    cudaMalloc(&buffers[d], numElems * sizeof(int));
+    cudaCheckError();
+    cudaMalloc(&buffersD2D[d], numElems * sizeof(int));
+    cudaCheckError();
+    cudaEventCreate(&start[d]);
+    cudaCheckError();
+    cudaEventCreate(&stop[d]);
+    cudaCheckError();
+  }
+
+  // Row-major results: element (i, j) lives at index i * numGPUs + j
+  vector<double> bandwidthMatrix(numGPUs * numGPUs);
+
+  for (int i = 0; i < numGPUs; i++) {
+    cudaSetDevice(i);
+
+    for (int j = 0; j < numGPUs; j++) {
+      int access = 0;
+      if (p2p) {
+        cudaDeviceCanAccessPeer(&access, i, j);
+        if (access) {
+          // Enable peer access in both directions, then return to device i
+          cudaDeviceEnablePeerAccess(j, 0);
+          cudaCheckError();
+          cudaSetDevice(j);
+          cudaCheckError();
+          cudaDeviceEnablePeerAccess(i, 0);
+          cudaCheckError();
+          cudaSetDevice(i);
+          cudaCheckError();
+        }
+      }
+
+      cudaStreamSynchronize(stream[i]);
+      cudaCheckError();
+
+      // Block the stream until all the work is queued up
+      // DANGER! - cudaMemcpy*Async may infinitely block waiting for
+      // room to push the operation, so keep the number of repetitions
+      // relatively low.  Higher repetitions will cause the delay kernel
+      // to time out and lead to unstable results.
+      *flag = 0;
+      delay<<<1, 1, 0, stream[i]>>>(flag);
+      cudaCheckError();
+      cudaEventRecord(start[i], stream[i]);
+      cudaCheckError();
+
+      if (i == j) {
+        // Perform intra-GPU, D2D copies
+        performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,
+                       access, stream[i]);
+
+      } else {
+        if (p2p_method == P2P_WRITE) {
+          performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,
+                         stream[i]);
+        } else {
+          performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,
+                         stream[i]);
+        }
+      }
+
+      cudaEventRecord(stop[i], stream[i]);
+      cudaCheckError();
+
+      // Release the queued events
+      *flag = 1;
+      cudaStreamSynchronize(stream[i]);
+      cudaCheckError();
+
+      float time_ms;
+      cudaEventElapsedTime(&time_ms, start[i], stop[i]);
+      double time_s = time_ms / 1e3;
+
+      // Total payload of all repeats, expressed in units of 1e9 bytes
+      double gb = numElems * sizeof(int) * repeat / (double)1e9;
+      if (i == j) {
+        gb *= 2;  // must count both the read and the write here
+      }
+      bandwidthMatrix[i * numGPUs + j] = gb / time_s;
+      if (p2p && access) {
+        // Undo the peer access enabled above so each pair starts clean
+        cudaDeviceDisablePeerAccess(j);
+        cudaSetDevice(j);
+        cudaDeviceDisablePeerAccess(i);
+        cudaSetDevice(i);
+        cudaCheckError();
+      }
+    }
+  }
+
+  // Column header followed by one row per device
+  printf("   D\\D");
+
+  for (int j = 0; j < numGPUs; j++) {
+    printf("%6d ", j);
+  }
+
+  printf("\n");
+
+  for (int i = 0; i < numGPUs; i++) {
+    printf("%6d ", i);
+
+    for (int j = 0; j < numGPUs; j++) {
+      printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]);
+    }
+
+    printf("\n");
+  }
+
+  // Release the per-device resources created at the top
+  for (int d = 0; d < numGPUs; d++) {
+    cudaSetDevice(d);
+    cudaFree(buffers[d]);
+    cudaFree(buffersD2D[d]);
+    cudaCheckError();
+    cudaEventDestroy(start[d]);
+    cudaCheckError();
+    cudaEventDestroy(stop[d]);
+    cudaCheckError();
+    cudaStreamDestroy(stream[d]);
+    cudaCheckError();
+  }
+
+  cudaFreeHost((void *)flag);
+  cudaCheckError();
+}
+
+// Measures and prints a numGPUs x numGPUs matrix of bidirectional copy
+// bandwidth in GB/s (1e9 bytes per second).
+//
+// For entry (i, j) two sets of `repeat` copies of numElems ints are queued at
+// the same time, one in each direction: one set on stream0 of device i and the
+// other on stream1 of device j. The elapsed time is taken between the start
+// and stop events recorded on stream0[i]. On the diagonal (i == j) the two
+// sets copy between the two buffers of the same device.
+//
+//   numGPUs - number of devices to test (devices 0 .. numGPUs - 1)
+//   p2p     - when true, peer access is enabled in both directions for each
+//             pair that reports it can access its peer, and disabled again
+//             after the measurement
+//
+// CUDA errors detected by cudaCheckError() terminate the process.
+void outputBidirectionalBandwidthMatrix(int numGPUs, bool p2p) {
+  int numElems = 10000000;
+  int repeat = 5;
+  volatile int *flag = NULL;
+  vector<int *> buffers(numGPUs);
+  vector<int *> buffersD2D(numGPUs);
+  vector<cudaEvent_t> start(numGPUs);
+  vector<cudaEvent_t> stop(numGPUs);
+  vector<cudaStream_t> stream0(numGPUs);
+  vector<cudaStream_t> stream1(numGPUs);
+
+  // Host-side flag polled by the delay kernel to hold back the queued work
+  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
+  cudaCheckError();
+
+  // Per-device resources: two buffers, a start/stop event pair and two streams
+  for (int d = 0; d < numGPUs; d++) {
+    cudaSetDevice(d);
+    cudaMalloc(&buffers[d], numElems * sizeof(int));
+    cudaMalloc(&buffersD2D[d], numElems * sizeof(int));
+    cudaCheckError();
+    cudaEventCreate(&start[d]);
+    cudaCheckError();
+    cudaEventCreate(&stop[d]);
+    cudaCheckError();
+    cudaStreamCreateWithFlags(&stream0[d], cudaStreamNonBlocking);
+    cudaCheckError();
+    cudaStreamCreateWithFlags(&stream1[d], cudaStreamNonBlocking);
+    cudaCheckError();
+  }
+
+  // Row-major results: element (i, j) lives at index i * numGPUs + j
+  vector<double> bandwidthMatrix(numGPUs * numGPUs);
+
+  for (int i = 0; i < numGPUs; i++) {
+    cudaSetDevice(i);
+
+    for (int j = 0; j < numGPUs; j++) {
+      int access = 0;
+      if (p2p) {
+        cudaDeviceCanAccessPeer(&access, i, j);
+        if (access) {
+          // Enable peer access in both directions
+          cudaSetDevice(i);
+          cudaDeviceEnablePeerAccess(j, 0);
+          cudaCheckError();
+          cudaSetDevice(j);
+          cudaDeviceEnablePeerAccess(i, 0);
+          cudaCheckError();
+        }
+      }
+
+      // The block above may have left device j current; go back to device i
+      cudaSetDevice(i);
+      cudaStreamSynchronize(stream0[i]);
+      cudaStreamSynchronize(stream1[j]);
+      cudaCheckError();
+
+      // Block the stream until all the work is queued up
+      // DANGER! - cudaMemcpy*Async may infinitely block waiting for
+      // room to push the operation, so keep the number of repetitions
+      // relatively low.  Higher repetitions will cause the delay kernel
+      // to time out and lead to unstable results.
+      *flag = 0;
+      cudaSetDevice(i);
+      // No need to block stream1 since it'll be blocked on stream0's event
+      delay<<<1, 1, 0, stream0[i]>>>(flag);
+      cudaCheckError();
+
+      // Force stream1 not to start until stream0 does, in order to ensure
+      // the events on stream0 fully encompass the time needed for all
+      // operations
+      cudaEventRecord(start[i], stream0[i]);
+      cudaStreamWaitEvent(stream1[j], start[i], 0);
+
+      if (i == j) {
+        // For intra-GPU perform 2 memcopies buffersD2D <-> buffers
+        performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,
+                       access, stream0[i]);
+        performP2PCopy(buffersD2D[i], i, buffers[i], i, numElems, repeat,
+                       access, stream1[i]);
+      } else {
+        // On the SM path the copy on stream1[j] is a kernel launch, so device j
+        // is made current for it and device i is restored afterwards
+        if (access && p2p_mechanism == SM) {
+          cudaSetDevice(j);
+        }
+        performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,
+                       stream1[j]);
+        if (access && p2p_mechanism == SM) {
+          cudaSetDevice(i);
+        }
+        performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,
+                       stream0[i]);
+      }
+
+      // Notify stream0 that stream1 is complete and record the time of
+      // the total transaction
+      cudaEventRecord(stop[j], stream1[j]);
+      cudaStreamWaitEvent(stream0[i], stop[j], 0);
+      cudaEventRecord(stop[i], stream0[i]);
+
+      // Release the queued operations
+      *flag = 1;
+      cudaStreamSynchronize(stream0[i]);
+      cudaStreamSynchronize(stream1[j]);
+      cudaCheckError();
+
+      float time_ms;
+      cudaEventElapsedTime(&time_ms, start[i], stop[i]);
+      double time_s = time_ms / 1e3;
+
+      // Payload of both directions over all repeats, in units of 1e9 bytes
+      double gb = 2.0 * numElems * sizeof(int) * repeat / (double)1e9;
+      if (i == j) {
+        gb *= 2;  // must count both the read and the write here
+      }
+      bandwidthMatrix[i * numGPUs + j] = gb / time_s;
+      if (p2p && access) {
+        // Undo the peer access enabled above; this leaves device j current
+        cudaSetDevice(i);
+        cudaDeviceDisablePeerAccess(j);
+        cudaSetDevice(j);
+        cudaDeviceDisablePeerAccess(i);
+      }
+    }
+  }
+
+  // Column header followed by one row per device
+  printf("   D\\D");
+
+  for (int j = 0; j < numGPUs; j++) {
+    printf("%6d ", j);
+  }
+
+  printf("\n");
+
+  for (int i = 0; i < numGPUs; i++) {
+    printf("%6d ", i);
+
+    for (int j = 0; j < numGPUs; j++) {
+      printf("%6.02f ", bandwidthMatrix[i * numGPUs + j]);
+    }
+
+    printf("\n");
+  }
+
+  // Release the per-device resources created at the top
+  for (int d = 0; d < numGPUs; d++) {
+    cudaSetDevice(d);
+    cudaFree(buffers[d]);
+    cudaFree(buffersD2D[d]);
+    cudaCheckError();
+    cudaEventDestroy(start[d]);
+    cudaCheckError();
+    cudaEventDestroy(stop[d]);
+    cudaCheckError();
+    cudaStreamDestroy(stream0[d]);
+    cudaCheckError();
+    cudaStreamDestroy(stream1[d]);
+    cudaCheckError();
+  }
+
+  cudaFreeHost((void *)flag);
+  cudaCheckError();
+}
+
+// Measures and prints two numGPUs x numGPUs latency matrices in microseconds:
+// one from GPU events ("GPU") and one from a host stop watch ("CPU").
+//
+// Entry (i, j) queues `repeat` minimal copies between device i and device j on
+// device i's stream. The GPU value is the event-measured time divided by
+// `repeat`; the CPU value is the host time spent inside performP2PCopy()
+// divided by `repeat`. The diagonal (i == j) copies between two buffers on the
+// same device.
+//
+//   numGPUs    - number of devices to test (devices 0 .. numGPUs - 1)
+//   p2p        - when true, peer access is enabled in both directions for each
+//                pair that reports it can access its peer, and disabled again
+//                after the measurement
+//   p2p_method - P2P_WRITE copies from device i's buffer to device j's buffer;
+//                any other value copies from device j's buffer to device i's
+//
+// CUDA errors detected by cudaCheckError() terminate the process.
+void outputLatencyMatrix(int numGPUs, bool p2p, P2PDataTransfer p2p_method) {
+  int repeat = 100;
+  // Transfer exactly one int4 (4 ints). performP2PCopy()'s kernel path copies
+  // num_elems / 4 int4 values, so a smaller transfer would launch kernels that
+  // copy nothing and the reported latency would not include any data movement.
+  int numElems = 4;
+  volatile int *flag = NULL;
+  StopWatchInterface *stopWatch = NULL;
+  vector<int *> buffers(numGPUs);
+  vector<int *> buffersD2D(numGPUs);  // buffer for D2D, that is, intra-GPU copy
+  vector<cudaStream_t> stream(numGPUs);
+  vector<cudaEvent_t> start(numGPUs);
+  vector<cudaEvent_t> stop(numGPUs);
+
+  // Host-side flag polled by the delay kernel to hold back the queued work
+  cudaHostAlloc((void **)&flag, sizeof(*flag), cudaHostAllocPortable);
+  cudaCheckError();
+
+  if (!sdkCreateTimer(&stopWatch)) {
+    printf("Failed to create stop watch\n");
+    exit(EXIT_FAILURE);
+  }
+  sdkStartTimer(&stopWatch);
+
+  // Per-device resources: one stream, two buffers and a start/stop event pair
+  for (int d = 0; d < numGPUs; d++) {
+    cudaSetDevice(d);
+    cudaStreamCreateWithFlags(&stream[d], cudaStreamNonBlocking);
+    cudaMalloc(&buffers[d], sizeof(int) * numElems);
+    cudaMalloc(&buffersD2D[d], sizeof(int) * numElems);
+    cudaCheckError();
+    cudaEventCreate(&start[d]);
+    cudaCheckError();
+    cudaEventCreate(&stop[d]);
+    cudaCheckError();
+  }
+
+  // Row-major results: element (i, j) lives at index i * numGPUs + j
+  vector<double> gpuLatencyMatrix(numGPUs * numGPUs);
+  vector<double> cpuLatencyMatrix(numGPUs * numGPUs);
+
+  for (int i = 0; i < numGPUs; i++) {
+    cudaSetDevice(i);
+
+    for (int j = 0; j < numGPUs; j++) {
+      int access = 0;
+      if (p2p) {
+        cudaDeviceCanAccessPeer(&access, i, j);
+        if (access) {
+          // Enable peer access in both directions, then return to device i
+          cudaDeviceEnablePeerAccess(j, 0);
+          cudaCheckError();
+          cudaSetDevice(j);
+          cudaDeviceEnablePeerAccess(i, 0);
+          cudaSetDevice(i);
+          cudaCheckError();
+        }
+      }
+      cudaStreamSynchronize(stream[i]);
+      cudaCheckError();
+
+      // Block the stream until all the work is queued up
+      // DANGER! - cudaMemcpy*Async may infinitely block waiting for
+      // room to push the operation, so keep the number of repetitions
+      // relatively low.  Higher repetitions will cause the delay kernel
+      // to time out and lead to unstable results.
+      *flag = 0;
+      delay<<<1, 1, 0, stream[i]>>>(flag);
+      cudaCheckError();
+      cudaEventRecord(start[i], stream[i]);
+
+      // Host timer covers only the time spent queuing the copies
+      sdkResetTimer(&stopWatch);
+      if (i == j) {
+        // Perform intra-GPU, D2D copies
+        performP2PCopy(buffers[i], i, buffersD2D[i], i, numElems, repeat,
+                       access, stream[i]);
+      } else {
+        if (p2p_method == P2P_WRITE) {
+          performP2PCopy(buffers[j], j, buffers[i], i, numElems, repeat, access,
+                         stream[i]);
+        } else {
+          performP2PCopy(buffers[i], i, buffers[j], j, numElems, repeat, access,
+                         stream[i]);
+        }
+      }
+      float cpu_time_ms = sdkGetTimerValue(&stopWatch);
+
+      cudaEventRecord(stop[i], stream[i]);
+      // Now that the work has been queued up, release the stream
+      *flag = 1;
+      cudaStreamSynchronize(stream[i]);
+      cudaCheckError();
+
+      float gpu_time_ms;
+      cudaEventElapsedTime(&gpu_time_ms, start[i], stop[i]);
+
+      // Milliseconds for all repeats -> microseconds per single copy
+      gpuLatencyMatrix[i * numGPUs + j] = gpu_time_ms * 1e3 / repeat;
+      cpuLatencyMatrix[i * numGPUs + j] = cpu_time_ms * 1e3 / repeat;
+      if (p2p && access) {
+        // Undo the peer access enabled above so each pair starts clean
+        cudaDeviceDisablePeerAccess(j);
+        cudaSetDevice(j);
+        cudaDeviceDisablePeerAccess(i);
+        cudaSetDevice(i);
+        cudaCheckError();
+      }
+    }
+  }
+
+  // GPU-measured matrix: column header followed by one row per device
+  printf("   GPU");
+
+  for (int j = 0; j < numGPUs; j++) {
+    printf("%6d ", j);
+  }
+
+  printf("\n");
+
+  for (int i = 0; i < numGPUs; i++) {
+    printf("%6d ", i);
+
+    for (int j = 0; j < numGPUs; j++) {
+      printf("%6.02f ", gpuLatencyMatrix[i * numGPUs + j]);
+    }
+
+    printf("\n");
+  }
+
+  // Host-measured matrix, same layout
+  printf("\n   CPU");
+
+  for (int j = 0; j < numGPUs; j++) {
+    printf("%6d ", j);
+  }
+
+  printf("\n");
+
+  for (int i = 0; i < numGPUs; i++) {
+    printf("%6d ", i);
+
+    for (int j = 0; j < numGPUs; j++) {
+      printf("%6.02f ", cpuLatencyMatrix[i * numGPUs + j]);
+    }
+
+    printf("\n");
+  }
+
+  // Release the per-device resources created at the top
+  for (int d = 0; d < numGPUs; d++) {
+    cudaSetDevice(d);
+    cudaFree(buffers[d]);
+    cudaFree(buffersD2D[d]);
+    cudaCheckError();
+    cudaEventDestroy(start[d]);
+    cudaCheckError();
+    cudaEventDestroy(stop[d]);
+    cudaCheckError();
+    cudaStreamDestroy(stream[d]);
+    cudaCheckError();
+  }
+
+  sdkDeleteTimer(&stopWatch);
+
+  cudaFreeHost((void *)flag);
+  cudaCheckError();
+}
+
+// Program entry point.
+//
+// Recognised flags: --help (print usage and exit), --p2p_read (additionally
+// report P2P-read results) and --sm_copy (use the copyp2p kernel instead of
+// cudaMemcpyPeerAsync when peer access is available).
+//
+// Prints the device list, the peer-access report and connectivity matrix, then
+// the unidirectional bandwidth, bidirectional bandwidth and latency matrices,
+// each with P2P disabled and enabled. Exits with EXIT_SUCCESS on completion;
+// any CUDA error detected by cudaCheckError() terminates with EXIT_FAILURE.
+int main(int argc, char **argv) {
+  int numGPUs;
+  P2PDataTransfer p2p_method = P2P_WRITE;
+
+  // process command line args
+  // --help is handled before the first CUDA call so that the usage text can
+  // be shown even when device enumeration would fail
+  if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
+    printHelp();
+    return 0;
+  }
+
+  cudaGetDeviceCount(&numGPUs);
+  cudaCheckError();
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "p2p_read")) {
+    p2p_method = P2P_READ;
+  }
+
+  if (checkCmdLineFlag(argc, (const char **)argv, "sm_copy")) {
+    p2p_mechanism = SM;
+  }
+
+  printf("[%s]\n", sSampleName);
+
+  // output devices
+  for (int i = 0; i < numGPUs; i++) {
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, i);
+    cudaCheckError();
+    printf("Device: %d, %s, pciBusID: %x, pciDeviceID: %x, pciDomainID:%x\n", i,
+           prop.name, prop.pciBusID, prop.pciDeviceID, prop.pciDomainID);
+  }
+
+  checkP2Paccess(numGPUs);
+
+  // Check peer-to-peer connectivity
+  printf("P2P Connectivity Matrix\n");
+  printf("     D\\D");
+
+  for (int j = 0; j < numGPUs; j++) {
+    printf("%6d", j);
+  }
+  printf("\n");
+
+  for (int i = 0; i < numGPUs; i++) {
+    printf("%6d\t", i);
+    for (int j = 0; j < numGPUs; j++) {
+      if (i != j) {
+        int access;
+        cudaDeviceCanAccessPeer(&access, i, j);
+        cudaCheckError();
+        printf("%6d", (access) ? 1 : 0);
+      } else {
+        // The diagonal is always printed as 1
+        printf("%6d", 1);
+      }
+    }
+    printf("\n");
+  }
+
+  printf("Unidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n");
+  outputBandwidthMatrix(numGPUs, false, P2P_WRITE);
+  printf("Unidirectional P2P=Enabled Bandwidth (P2P Writes) Matrix (GB/s)\n");
+  outputBandwidthMatrix(numGPUs, true, P2P_WRITE);
+  if (p2p_method == P2P_READ) {
+    printf("Unidirectional P2P=Enabled Bandwidth (P2P Reads) Matrix (GB/s)\n");
+    outputBandwidthMatrix(numGPUs, true, p2p_method);
+  }
+  printf("Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)\n");
+  outputBidirectionalBandwidthMatrix(numGPUs, false);
+  printf("Bidirectional P2P=Enabled Bandwidth Matrix (GB/s)\n");
+  outputBidirectionalBandwidthMatrix(numGPUs, true);
+
+  printf("P2P=Disabled Latency Matrix (us)\n");
+  outputLatencyMatrix(numGPUs, false, P2P_WRITE);
+  printf("P2P=Enabled Latency (P2P Writes) Matrix (us)\n");
+  outputLatencyMatrix(numGPUs, true, P2P_WRITE);
+  if (p2p_method == P2P_READ) {
+    printf("P2P=Enabled Latency (P2P Reads) Matrix (us)\n");
+    outputLatencyMatrix(numGPUs, true, p2p_method);
+  }
+
+  printf(
+      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
+      "Results may vary when GPU Boost is enabled.\n");
+
+  exit(EXIT_SUCCESS);
+}
--- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.sln
+++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.sln
@ -1,7 +1,7 @@

-Microsoft Visual Studio Solution File, Format Version 11.00
-# Visual Studio 2010
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warpAggregatedAtomicsCG", "warpAggregatedAtomicsCG_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2012
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "p2pBandwidthLatencyTest", "p2pBandwidthLatencyTest_vs2012.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
--- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.vcxproj
+++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2012.vcxproj
@ -15,14 +15,15 @@
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
-    <RootNamespace>simpleVoteIntrinsics_vs2010</RootNamespace>
-    <ProjectName>simpleVoteIntrinsics</ProjectName>
+    <RootNamespace>p2pBandwidthLatencyTest_vs2012</RootNamespace>
+    <ProjectName>p2pBandwidthLatencyTest</ProjectName>
    <CudaToolkitCustomDir />
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup>
    <ConfigurationType>Application</ConfigurationType>
    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v110</PlatformToolset>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
    <UseDebugLibraries>true</UseDebugLibraries>
@ -32,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -58,10 +59,10 @@
      <SubSystem>Console</SubSystem>
      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
-      <OutputFile>$(OutDir)/simpleVoteIntrinsics.exe</OutputFile>
+      <OutputFile>$(OutDir)/p2pBandwidthLatencyTest.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -96,11 +97,11 @@
    </CudaCompile>
  </ItemDefinitionGroup>
  <ItemGroup>
-    <CudaCompile Include="simpleVoteIntrinsics.cu" />
-    <None Include="simpleVote_kernel.cuh" />
+    <CudaCompile Include="p2pBandwidthLatencyTest.cu" />
+
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.sln
+++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.sln
@ -1,7 +1,7 @@

-Microsoft Visual Studio Solution File, Format Version 11.00
-# Visual Studio 2010
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cudaTensorCoreGemm", "cudaTensorCoreGemm_vs2010.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+Microsoft Visual Studio Solution File, Format Version 13.00
+# Visual Studio 2013
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "p2pBandwidthLatencyTest", "p2pBandwidthLatencyTest_vs2013.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
--- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.vcxproj
+++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2013.vcxproj
@ -0,0 +1,107 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>p2pBandwidthLatencyTest_vs2013</RootNamespace>
+    <ProjectName>p2pBandwidthLatencyTest</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v120</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/p2pBandwidthLatencyTest.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="p2pBandwidthLatencyTest.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.sln
+++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 14.00
+# Visual Studio 2015
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "p2pBandwidthLatencyTest", "p2pBandwidthLatencyTest_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.vcxproj
+++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2015.vcxproj
@ -0,0 +1,107 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>p2pBandwidthLatencyTest_vs2015</RootNamespace>
+    <ProjectName>p2pBandwidthLatencyTest</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v140</PlatformToolset>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/p2pBandwidthLatencyTest.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="p2pBandwidthLatencyTest.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.sln
+++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.sln
@ -0,0 +1,20 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2017
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "p2pBandwidthLatencyTest", "p2pBandwidthLatencyTest_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
+		{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj
+++ b/Samples/p2pBandwidthLatencyTest/p2pBandwidthLatencyTest_vs2017.vcxproj
@ -0,0 +1,108 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
+  </PropertyGroup>
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
+    <RootNamespace>p2pBandwidthLatencyTest_vs2017</RootNamespace>
+    <ProjectName>p2pBandwidthLatencyTest</ProjectName>
+    <CudaToolkitCustomDir />
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup>
+    <ConfigurationType>Application</ConfigurationType>
+    <CharacterSet>MultiByte</CharacterSet>
+    <PlatformToolset>v141</PlatformToolset>
+	<WindowsTargetPlatformVersion>10.0.15063.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
+    <UseDebugLibraries>true</UseDebugLibraries>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)'=='Release'">
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets">
+    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup>
+    <IntDir>$(Platform)/$(Configuration)/</IntDir>
+    <IncludePath>$(IncludePath)</IncludePath>
+    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
+    <CodeAnalysisRules />
+    <CodeAnalysisRuleAssemblies />
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Platform)'=='x64'">
+    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
+      <OutputFile>$(OutDir)/p2pBandwidthLatencyTest.exe</OutputFile>
+    </Link>
+    <CudaCompile>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
+      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
+      <Include>./;../../Common</Include>
+      <Defines>WIN32</Defines>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
+    <ClCompile>
+      <Optimization>Disabled</Optimization>
+      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MTd</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
+    <ClCompile>
+      <Optimization>MaxSpeed</Optimization>
+      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
+    </ClCompile>
+    <Link>
+      <GenerateDebugInformation>false</GenerateDebugInformation>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+    </Link>
+    <CudaCompile>
+      <Runtime>MT</Runtime>
+      <TargetMachinePlatform>64</TargetMachinePlatform>
+    </CudaCompile>
+  </ItemDefinitionGroup>
+  <ItemGroup>
+    <CudaCompile Include="p2pBandwidthLatencyTest.cu" />
+
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
+  </ImportGroup>
+</Project>
--- a/Samples/shfl_scan/Makefile
+++ b/Samples/shfl_scan/Makefile
@ -1,31 +1,29 @@
 ################################################################################
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
-# Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
 #
-# NOTICE TO USER:
-#
-# This source code is subject to NVIDIA ownership rights under U.S. and
-# international Copyright laws.
-#
-# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-# OR PERFORMANCE OF THIS SOURCE CODE.
-#
-# U.S. Government End Users.  This source code is a "commercial item" as
-# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
-# "commercial computer software" and "commercial computer software
-# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-# and is provided to the U.S. Government only as a commercial end item.
-# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-# source code with only those rights set forth herein.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 ################################################################################
 #
@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
        else ifeq ($(TARGET_OS), android)
-            HOST_COMPILER ?= aarch64-linux-android-g++
+            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
@ -248,7 +246,7 @@ LIBRARIES :=
 ################################################################################

 # Gencode arguments
-SMS ?= 30 35 37 50 52 60 61 70
+SMS ?= 30 35 37 50 52 60 61 70 75

 ifeq ($(SMS),)
 $(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
--- a/Samples/shfl_scan/NsightEclipse.xml
+++ b/Samples/shfl_scan/NsightEclipse.xml
@ -42,6 +42,7 @@
  <sm-arch>sm60</sm-arch>
  <sm-arch>sm61</sm-arch>
  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm75</sm-arch>
  <supported_envs>
    <env>
      <arch>x86_64</arch>
--- a/Samples/shfl_scan/README.md
+++ b/Samples/shfl_scan/README.md
@ -10,7 +10,7 @@ Data-Parallel Algorithms, Performance Strategies

 ## Supported SM Architectures

-[SM 3.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)
+[SM 3.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.5 ](https://developer.nvidia.com/cuda-gpus)  [SM 3.7 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 5.2 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 6.1 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.0 ](https://developer.nvidia.com/cuda-gpus)  [SM 7.5 ](https://developer.nvidia.com/cuda-gpus)

 ## Supported OSes

@ -24,7 +24,7 @@ x86_64, ppc64le, armv7l, aarch64

 ## Prerequisites

-Download and install the [CUDA Toolkit 9.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit 10.0](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## Build and Run

--- a/Samples/shfl_scan/shfl_scan_vs2010.vcxproj
+++ b/Samples/shfl_scan/shfl_scan_vs2010.vcxproj
@ -1,107 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <PropertyGroup>
-    <CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
-  </PropertyGroup>
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
-    <RootNamespace>shfl_scan_vs2010</RootNamespace>
-    <ProjectName>shfl_scan</ProjectName>
-    <CudaToolkitCustomDir />
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup>
-    <ConfigurationType>Application</ConfigurationType>
-    <CharacterSet>MultiByte</CharacterSet>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)'=='Debug'">
-    <UseDebugLibraries>true</UseDebugLibraries>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)'=='Release'">
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
-  </ImportGroup>
-  <ImportGroup Label="PropertySheets">
-    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup>
-    <IntDir>$(Platform)/$(Configuration)/</IntDir>
-    <IncludePath>$(IncludePath)</IncludePath>
-    <CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
-    <CodeAnalysisRules />
-    <CodeAnalysisRuleAssemblies />
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Platform)'=='x64'">
-    <OutDir>../../bin/win64/$(Configuration)/</OutDir>
-  </PropertyGroup>
-  <ItemDefinitionGroup>
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../Common;</AdditionalIncludeDirectories>
-    </ClCompile>
-    <Link>
-      <SubSystem>Console</SubSystem>
-      <AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
-      <OutputFile>$(OutDir)/shfl_scan.exe</OutputFile>
-    </Link>
-    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
-      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
-      <Include>./;../../Common</Include>
-      <Defines>WIN32</Defines>
-    </CudaCompile>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
-    <ClCompile>
-      <Optimization>Disabled</Optimization>
-      <RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
-    </Link>
-    <CudaCompile>
-      <Runtime>MTd</Runtime>
-      <TargetMachinePlatform>64</TargetMachinePlatform>
-    </CudaCompile>
-  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
-    <ClCompile>
-      <Optimization>MaxSpeed</Optimization>
-      <RuntimeLibrary>MultiThreaded</RuntimeLibrary>
-    </ClCompile>
-    <Link>
-      <GenerateDebugInformation>false</GenerateDebugInformation>
-      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
-    </Link>
-    <CudaCompile>
-      <Runtime>MT</Runtime>
-      <TargetMachinePlatform>64</TargetMachinePlatform>
-    </CudaCompile>
-  </ItemDefinitionGroup>
-  <ItemGroup>
-    <CudaCompile Include="shfl_scan.cu" />
-    <ClInclude Include="util.h" />
-    <None Include="shfl_integral_image.cuh" />
-  </ItemGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
-  </ImportGroup>
-</Project>
--- a/Samples/shfl_scan/shfl_scan_vs2012.vcxproj
+++ b/Samples/shfl_scan/shfl_scan_vs2012.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/shfl_scan.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -103,6 +103,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/shfl_scan/shfl_scan_vs2013.vcxproj
+++ b/Samples/shfl_scan/shfl_scan_vs2013.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/shfl_scan.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -103,6 +103,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/shfl_scan/shfl_scan_vs2015.vcxproj
+++ b/Samples/shfl_scan/shfl_scan_vs2015.vcxproj
@ -33,7 +33,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -62,7 +62,7 @@
      <OutputFile>$(OutDir)/shfl_scan.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -103,6 +103,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/shfl_scan/shfl_scan_vs2017.vcxproj
+++ b/Samples/shfl_scan/shfl_scan_vs2017.vcxproj
@ -34,7 +34,7 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.props" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets">
    <Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
@ -63,7 +63,7 @@
      <OutputFile>$(OutDir)/shfl_scan.exe</OutputFile>
    </Link>
    <CudaCompile>
-      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;</CodeGeneration>
      <AdditionalOptions>-Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
      <Include>./;../../Common</Include>
      <Defines>WIN32</Defines>
@ -104,6 +104,6 @@
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
-    <Import Project="$(CUDAPropsPath)\CUDA 9.2.targets" />
+    <Import Project="$(CUDAPropsPath)\CUDA 10.0.targets" />
  </ImportGroup>
 </Project>
--- a/Samples/simpleCUBLAS/Makefile
+++ b/Samples/simpleCUBLAS/Makefile
@ -1,31 +1,29 @@
 ################################################################################
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
-# Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
 #
-# NOTICE TO USER:
-#
-# This source code is subject to NVIDIA ownership rights under U.S. and
-# international Copyright laws.
-#
-# NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
-# CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
-# IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
-# REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
-# IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
-# OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
-# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
-# OR PERFORMANCE OF THIS SOURCE CODE.
-#
-# U.S. Government End Users.  This source code is a "commercial item" as
-# that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
-# "commercial computer software" and "commercial computer software
-# documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
-# and is provided to the U.S. Government only as a commercial end item.
-# Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
-# 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
-# source code with only those rights set forth herein.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #
 ################################################################################
 #
@ -141,7 +139,7 @@ else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
            export QNX_TARGET
            HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
        else ifeq ($(TARGET_OS), android)
-            HOST_COMPILER ?= aarch64-linux-android-g++
+            HOST_COMPILER ?= aarch64-linux-android-clang++
        endif
    else ifeq ($(TARGET_ARCH),ppc64le)
        HOST_COMPILER ?= powerpc64le-linux-gnu-g++
--- a/Samples/simpleCUBLAS/NsightEclipse.xml
+++ b/Samples/simpleCUBLAS/NsightEclipse.xml
@ -41,6 +41,7 @@
  <sm-arch>sm60</sm-arch>
  <sm-arch>sm61</sm-arch>
  <sm-arch>sm70</sm-arch>
+  <sm-arch>sm75</sm-arch>
  <supported_envs>
    <env>
      <arch>x86_64</arch>
--- a/Show More
+++ b/Show More