update vulkan samples with SPIR-V shaders

2025-07-29 17:23:15 +08:00 · 2021-06-02 17:17:21 +05:30 · 2021-06-02 17:17:21 +05:30 · 7a5b3e6c8c
commit 7a5b3e6c8c
parent 5c3ec60fae
17 changed files with 2368 additions and 2116 deletions
--- a/Samples/simpleVulkan/Build_instructions.txt
+++ b/Samples/simpleVulkan/Build_instructions.txt
@ -19,8 +19,17 @@ For Linux:
 -- Install "libxcb1-dev" and "xorg-dev" as GLFW3 is depended on it
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH

+
 For Linux aarch64(L4T):
 -- Install GLFW3 library using "sudo apt-get install libglfw3-dev" this will provide glfw3 
 -- install above will also provide libvulkan-dev as dependencies
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
-- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr"
+-- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr"
+
+
+For Shader changes:
+-- Update the sinewave.vert and/or sinewave.frag shader source file as required
+-- Use the glslc shader compiler from the installed Vulkan SDK's bin directory to compile shaders as:
+    glslc sinewave.vert -o vert.spv
+    glslc sinewave.frag -o frag.spv
+** Make sure to add glslc's path in your PATH environment variable **
--- a/Samples/simpleVulkan/frag.spv
+++ b/Samples/simpleVulkan/frag.spv
--- a/Samples/simpleVulkan/main.cpp
+++ b/Samples/simpleVulkan/main.cpp
@ -92,9 +92,9 @@ class VulkanCudaSineWave : public VulkanBaseApp {
    }
    // Add our compiled vulkan shader files
    char *vertex_shader_path =
-        sdkFindFilePath("sinewave.vert", execution_path.c_str());
+        sdkFindFilePath("vert.spv", execution_path.c_str());
    char *fragment_shader_path =
-        sdkFindFilePath("sinewave.frag", execution_path.c_str());
+        sdkFindFilePath("frag.spv", execution_path.c_str());
    m_shaderFiles.push_back(
        std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path));
    m_shaderFiles.push_back(
--- a/Samples/simpleVulkan/vert.spv
+++ b/Samples/simpleVulkan/vert.spv
--- a/Samples/simpleVulkanMMAP/Build_instructions.txt
+++ b/Samples/simpleVulkanMMAP/Build_instructions.txt
@ -0,0 +1,35 @@
+For Windows:
+Follow these steps once you have installed Vulkan SDK for Windows from https://www.lunarg.com/vulkan-sdk/
+-- Install GLFW3 library at suitable location
+-- Open the simpleVulkanMMAP VS project file.
+To add the GLFW3 library path
+-- Right click on Project name "simpleVulkanMMAP" click on "Properties"
+-- In Property pages window go to Linker -> General. Here in "Additional Library Directories" edit and add path to glfw3dll.lib
+To add the GLFW3 headers path
+-- Right click on Project name "simpleVulkanMMAP" click on "Properties"
+-- In Property pages window go to "VC++ Directories" section. Here in "Include Directories" edit and add path to GLFW3 headers include directory location.
+** Make sure to add path to glfw3.dll in your PATH environment variable**
+
+
+For Linux:
+-- Install the Vulkan SDK from https://www.lunarg.com/vulkan-sdk/  and follow environment setup instructions.
+-- Install GLFW3 library through your OS package repository. For example: apt-get for Ubuntu and dnf for RHEL/CentOS. Below is for Ubuntu:
+    sudo apt-get install libglfw3
+    sudo apt-get install libglfw3-dev
+-- Install "libxcb1-dev" and "xorg-dev", as GLFW3 depends on them
+-- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
+
+
+For Linux aarch64(L4T):
+-- Install the GLFW3 library using "sudo apt-get install libglfw3-dev"; this will provide glfw3
+-- The install above will also provide libvulkan-dev as a dependency
+-- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
+-- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr"
+
+
+For Shader changes:
+-- Update the montecarlo.vert and/or montecarlo.frag shader source file as required
+-- Use the glslc shader compiler from the installed Vulkan SDK's bin directory to compile shaders as:
+    glslc montecarlo.vert -o vert.spv
+    glslc montecarlo.frag -o frag.spv
+** Make sure to add glslc's path in your PATH environment variable **
--- a/Samples/simpleVulkanMMAP/MonteCarloPi.cu
+++ b/Samples/simpleVulkanMMAP/MonteCarloPi.cu
@ -25,9 +25,9 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

- /*
-  * See: https://www.piday.org/million/
-  */
+/*
+ * See: https://www.piday.org/million/
+ */

 #include "MonteCarloPi.h"
 #include <algorithm>
@ -37,15 +37,16 @@

 #define ROUND_UP_TO_GRANULARITY(x, n) (((x + n - 1) / n) * n)

-  // `ipcHandleTypeFlag` specifies the platform specific handle type this sample
-  // uses for importing and exporting memory allocation. On Linux this sample
-  // specifies the type as CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR meaning that
-  // file descriptors will be used. On Windows this sample specifies the type as
-  // CU_MEM_HANDLE_TYPE_WIN32 meaning that NT HANDLEs will be used. The
-  // ipcHandleTypeFlag variable is a convenience variable and is passed by value
-  // to individual requests.
+// `ipcHandleTypeFlag` specifies the platform specific handle type this sample
+// uses for importing and exporting memory allocation. On Linux this sample
+// specifies the type as CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR meaning that
+// file descriptors will be used. On Windows this sample specifies the type as
+// CU_MEM_HANDLE_TYPE_WIN32 meaning that NT HANDLEs will be used. The
+// ipcHandleTypeFlag variable is a convenience variable and is passed by value
+// to individual requests.
 #if defined(__linux__)
-CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+CUmemAllocationHandleType ipcHandleTypeFlag =
+    CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
 #else
 CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_WIN32;
 #endif
@ -53,223 +54,248 @@ CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_WIN32;
 // Windows-specific LPSECURITYATTRIBUTES
 void getDefaultSecurityDescriptor(CUmemAllocationProp *prop) {
 #if defined(__linux__)
-    return;
+  return;
 #elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-    static const char sddl[] = "D:P(OA;;GARCSDWDWOCCDCLCSWLODTWPRPCRFA;;;WD)";
-    static OBJECT_ATTRIBUTES objAttributes;
-    static bool objAttributesConfigured = false;
+  static const char sddl[] = "D:P(OA;;GARCSDWDWOCCDCLCSWLODTWPRPCRFA;;;WD)";
+  static OBJECT_ATTRIBUTES objAttributes;
+  static bool objAttributesConfigured = false;

-    if (!objAttributesConfigured) {
-        PSECURITY_DESCRIPTOR secDesc;
-        BOOL result = ConvertStringSecurityDescriptorToSecurityDescriptorA(
-            sddl, SDDL_REVISION_1, &secDesc, NULL);
-        if (result == 0) {
-            printf("IPC failure: getDefaultSecurityDescriptor Failed! (%d)\n",
-                GetLastError());
-        }
-
-        InitializeObjectAttributes(&objAttributes, NULL, 0, NULL, secDesc);
-
-        objAttributesConfigured = true;
+  if (!objAttributesConfigured) {
+    PSECURITY_DESCRIPTOR secDesc;
+    BOOL result = ConvertStringSecurityDescriptorToSecurityDescriptorA(
+        sddl, SDDL_REVISION_1, &secDesc, NULL);
+    if (result == 0) {
+      printf("IPC failure: getDefaultSecurityDescriptor Failed! (%d)\n",
+             GetLastError());
    }

-    prop->win32HandleMetaData = &objAttributes;
-    return;
+    InitializeObjectAttributes(&objAttributes, NULL, 0, NULL, secDesc);
+
+    objAttributesConfigured = true;
+  }
+
+  prop->win32HandleMetaData = &objAttributes;
+  return;
 #endif
 }

-__global__ void monte_carlo_kernel(vec2 *xyVector, float *pointsInsideCircle, float *numPointsInCircle, unsigned int numPoints, float time)
-{
-    const size_t stride = gridDim.x * blockDim.x;
-    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
-    float count = 0.0f;
+__global__ void monte_carlo_kernel(vec2 *xyVector, float *pointsInsideCircle,
+                                   float *numPointsInCircle,
+                                   unsigned int numPoints, float time) {
+  const size_t stride = gridDim.x * blockDim.x;
+  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  float count = 0.0f;

-    curandState rgnState;
-    curand_init((unsigned long long)time, tid, 0, &rgnState);
+  curandState rgnState;
+  curand_init((unsigned long long)time, tid, 0, &rgnState);

-    for (; tid < numPoints; tid += stride) {
-        float x = curand_uniform(&rgnState);
-        float y = curand_uniform(&rgnState);
-        x = (2.0f * x) - 1.0f;
-        y = (2.0f * y) - 1.0f;
-        xyVector[tid][0] = x;
-        xyVector[tid][1] = y;
+  for (; tid < numPoints; tid += stride) {
+    float x = curand_uniform(&rgnState);
+    float y = curand_uniform(&rgnState);
+    x = (2.0f * x) - 1.0f;
+    y = (2.0f * y) - 1.0f;
+    xyVector[tid][0] = x;
+    xyVector[tid][1] = y;

-        // Compute the distance of this point form the center(0, 0)
-        float dist = sqrtf((x*x) + (y*y));
+    // Compute the distance of this point from the center(0, 0)
+    float dist = sqrtf((x * x) + (y * y));

-        // If distance is less than the radius of the unit circle, the point lies in the circle.
-        pointsInsideCircle[tid] = (dist <= 1.0f);
-        count += (dist <= 1.0f);
-    }
-    atomicAdd(numPointsInCircle, count);
+    // If the distance is at most the radius of the unit circle, the point lies
+    // in the circle.
+    pointsInsideCircle[tid] = (dist <= 1.0f);
+    count += (dist <= 1.0f);
+  }
+  atomicAdd(numPointsInCircle, count);
 }

-MonteCarloPiSimulation::MonteCarloPiSimulation(size_t num_points) :
-    m_xyVector(nullptr),
-    m_pointsInsideCircle(nullptr),
-    m_totalPointsInsideCircle(0),
-    m_totalPointsSimulated(0),
-    m_numPoints(num_points)
-{
+MonteCarloPiSimulation::MonteCarloPiSimulation(size_t num_points)
+    : m_xyVector(nullptr),
+      m_pointsInsideCircle(nullptr),
+      m_totalPointsInsideCircle(0),
+      m_totalPointsSimulated(0),
+      m_numPoints(num_points) {}
+
+MonteCarloPiSimulation::~MonteCarloPiSimulation() {
+  if (m_numPointsInCircle) {
+    checkCudaErrors(cudaFree(m_numPointsInCircle));
+    m_numPointsInCircle = nullptr;
+  }
+  if (m_hostNumPointsInCircle) {
+    checkCudaErrors(cudaFreeHost(m_hostNumPointsInCircle));
+    m_hostNumPointsInCircle = nullptr;
+  }
+
+  cleanupSimulationAllocations();
 }

-MonteCarloPiSimulation::~MonteCarloPiSimulation()
-{
-    if (m_numPointsInCircle) {
-        checkCudaErrors(cudaFree(m_numPointsInCircle));
-        m_numPointsInCircle = nullptr;
-    }
-    if (m_hostNumPointsInCircle) {
-        checkCudaErrors(cudaFreeHost(m_hostNumPointsInCircle));
-        m_hostNumPointsInCircle = nullptr;
-    }
+void MonteCarloPiSimulation::initSimulation(int cudaDevice,
+                                            cudaStream_t stream) {
+  m_cudaDevice = cudaDevice;
+  getIdealExecutionConfiguration();

-    cleanupSimulationAllocations();
+  // Allocate a position buffer that contains random location of the points in
+  // XY cartesian plane.
+  // Allocate a bitmap buffer which holds information of whether a point in the
+  // position buffer is inside the unit circle or not.
+  setupSimulationAllocations();
+
+  checkCudaErrors(
+      cudaMalloc((float **)&m_numPointsInCircle, sizeof(*m_numPointsInCircle)));
+  checkCudaErrors(cudaMallocHost((float **)&m_hostNumPointsInCircle,
+                                 sizeof(*m_hostNumPointsInCircle)));
 }

-void MonteCarloPiSimulation::initSimulation(int cudaDevice, cudaStream_t stream)
-{
-    m_cudaDevice = cudaDevice;
-    getIdealExecutionConfiguration();
+void MonteCarloPiSimulation::stepSimulation(float time, cudaStream_t stream) {
+  checkCudaErrors(cudaMemsetAsync(m_numPointsInCircle, 0,
+                                  sizeof(*m_numPointsInCircle), stream));

-    // Allocate a position buffer that contains random location of the points in XY cartesian plane.
-    // Allocate a bitmap buffer which holds information of whether a point in the position buffer is inside the unit circle or not.
-    setupSimulationAllocations();
+  monte_carlo_kernel<<<m_blocks, m_threads, 0, stream>>>(
+      m_xyVector, m_pointsInsideCircle, m_numPointsInCircle, m_numPoints, time);
+  getLastCudaError("Failed to launch CUDA simulation");

-    checkCudaErrors(cudaMalloc((float **)&m_numPointsInCircle, sizeof(*m_numPointsInCircle)));
-    checkCudaErrors(cudaMallocHost((float **)&m_hostNumPointsInCircle, sizeof(*m_hostNumPointsInCircle)));
+  checkCudaErrors(cudaMemcpyAsync(m_hostNumPointsInCircle, m_numPointsInCircle,
+                                  sizeof(*m_numPointsInCircle),
+                                  cudaMemcpyDeviceToHost, stream));
+
+  // Queue up a stream callback to compute and print the PI value.
+  checkCudaErrors(
+      cudaLaunchHostFunc(stream, this->computePiCallback, (void *)this));
 }

-void MonteCarloPiSimulation::stepSimulation(float time, cudaStream_t stream)
-{
-
-    checkCudaErrors(cudaMemsetAsync(m_numPointsInCircle, 0, sizeof(*m_numPointsInCircle), stream));
-
-    monte_carlo_kernel << < m_blocks, m_threads, 0, stream >> > (m_xyVector, m_pointsInsideCircle, m_numPointsInCircle, m_numPoints, time);
-    getLastCudaError("Failed to launch CUDA simulation");
-
-    checkCudaErrors(cudaMemcpyAsync(m_hostNumPointsInCircle, m_numPointsInCircle, sizeof(*m_numPointsInCircle), cudaMemcpyDeviceToHost, stream));
-
-    // Queue up a stream callback to compute and print the PI value.
-    checkCudaErrors(cudaLaunchHostFunc(stream, this->computePiCallback, (void *)this));
+void MonteCarloPiSimulation::computePiCallback(void *args) {
+  MonteCarloPiSimulation *cbData = (MonteCarloPiSimulation *)args;
+  cbData->m_totalPointsInsideCircle += *(cbData->m_hostNumPointsInCircle);
+  cbData->m_totalPointsSimulated += cbData->m_numPoints;
+  double piValue = 4.0 * ((double)cbData->m_totalPointsInsideCircle /
+                          (double)cbData->m_totalPointsSimulated);
+  printf("Approximate Pi value for %zd data points: %lf \n",
+         cbData->m_totalPointsSimulated, piValue);
 }

-void MonteCarloPiSimulation::computePiCallback(void *args)
-{
-    MonteCarloPiSimulation *cbData = (MonteCarloPiSimulation *)args;
-    cbData->m_totalPointsInsideCircle += *(cbData->m_hostNumPointsInCircle);
-    cbData->m_totalPointsSimulated += cbData->m_numPoints;
-    double piValue = 4.0 * ((double)cbData->m_totalPointsInsideCircle / (double)cbData->m_totalPointsSimulated);
-    printf("Approximate Pi value for %zd data points: %lf \n", cbData->m_totalPointsSimulated, piValue);
+void MonteCarloPiSimulation::getIdealExecutionConfiguration() {
+  int warpSize = 0;
+  int multiProcessorCount = 0;
+
+  checkCudaErrors(cudaSetDevice(m_cudaDevice));
+  checkCudaErrors(
+      cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, m_cudaDevice));
+
+  // We don't need large block sizes, since there's not much inter-thread
+  // communication
+  m_threads = warpSize;
+
+  // Use the occupancy calculator and fill the gpu as best as we can
+  checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &m_blocks, monte_carlo_kernel, warpSize, 0));
+
+  checkCudaErrors(cudaDeviceGetAttribute(
+      &multiProcessorCount, cudaDevAttrMultiProcessorCount, m_cudaDevice));
+  m_blocks *= multiProcessorCount;
+
+  // Go ahead and clamp the blocks to the minimum needed to cover
+  // m_numPoints
+  m_blocks =
+      std::min(m_blocks, (int)((m_numPoints + m_threads - 1) / m_threads));
 }

-void MonteCarloPiSimulation::getIdealExecutionConfiguration()
-{
-    int warpSize = 0;
-    int multiProcessorCount = 0;
+void MonteCarloPiSimulation::setupSimulationAllocations() {
+  CUdeviceptr d_ptr = 0U;
+  size_t granularity = 0;
+  CUmemGenericAllocationHandle cudaPositionHandle, cudaInCircleHandle;

-    checkCudaErrors(cudaSetDevice(m_cudaDevice));
-    checkCudaErrors(cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, m_cudaDevice));
+  CUmemAllocationProp allocProp = {};
+  allocProp.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  allocProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  allocProp.location.id = m_cudaDevice;
+  allocProp.win32HandleMetaData = NULL;
+  allocProp.requestedHandleTypes = ipcHandleTypeFlag;

-    // We don't need large block sizes, since there's not much inter-thread communication
-    m_threads = warpSize;
+  // Windows-specific LPSECURITYATTRIBUTES is required when
+  // CU_MEM_HANDLE_TYPE_WIN32 is used. The security attribute defines the scope
+  // in which exported allocations may be transferred to other processes. For
+  // all other handle types, pass NULL.
+  getDefaultSecurityDescriptor(&allocProp);

-    // Use the occupancy calculator and fill the gpu as best as we can
-    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_blocks, monte_carlo_kernel, warpSize, 0));
+  // Get the recommended granularity for m_cudaDevice.
+  checkCudaErrors(cuMemGetAllocationGranularity(
+      &granularity, &allocProp, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));

-    checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, m_cudaDevice));
-    m_blocks *= multiProcessorCount;
+  size_t xyPositionVecSize = m_numPoints * sizeof(*m_xyVector);
+  size_t inCircleVecSize = m_numPoints * sizeof(*m_pointsInsideCircle);

-    // Go ahead and the clamp the blocks to the minimum needed for this height/width
-    m_blocks = std::min(m_blocks, (int)((m_numPoints + m_threads - 1) / m_threads));
+  size_t xyPositionSize =
+      ROUND_UP_TO_GRANULARITY(xyPositionVecSize, granularity);
+  size_t inCircleSize = ROUND_UP_TO_GRANULARITY(inCircleVecSize, granularity);
+  m_totalAllocationSize = (xyPositionSize + inCircleSize);
+
+  // Reserve the required contiguous VA space for the allocations
+  checkCudaErrors(
+      cuMemAddressReserve(&d_ptr, m_totalAllocationSize, granularity, 0U, 0));
+
+  // Create the allocations as a pinned allocation on this device.
+  // Create an allocation to store all the positions of points on the xy plane
+  // and a second allocation which stores information if the corresponding
+  // position is inside the unit circle or not.
+  checkCudaErrors(
+      cuMemCreate(&cudaPositionHandle, xyPositionSize, &allocProp, 0));
+  checkCudaErrors(
+      cuMemCreate(&cudaInCircleHandle, inCircleSize, &allocProp, 0));
+
+  // Export the allocation to a platform-specific handle. The type of handle
+  // requested here must match the requestedHandleTypes field in the prop
+  // structure passed to cuMemCreate. The handle obtained here will be passed to
+  // vulkan to import the allocation.
+  checkCudaErrors(cuMemExportToShareableHandle(
+      (void *)&m_posShareableHandle, cudaPositionHandle, ipcHandleTypeFlag, 0));
+  checkCudaErrors(
+      cuMemExportToShareableHandle((void *)&m_inCircleShareableHandle,
+                                   cudaInCircleHandle, ipcHandleTypeFlag, 0));
+
+  CUdeviceptr va_position = d_ptr;
+  CUdeviceptr va_InCircle = va_position + xyPositionSize;
+  m_pointsInsideCircle = (float *)va_InCircle;
+  m_xyVector = (vec2 *)va_position;
+
+  // Assign the chunk to the appropriate VA range
+  checkCudaErrors(
+      cuMemMap(va_position, xyPositionSize, 0, cudaPositionHandle, 0));
+  checkCudaErrors(
+      cuMemMap(va_InCircle, inCircleSize, 0, cudaInCircleHandle, 0));
+
+  // Release the handles for the allocation. Since the allocation is currently
+  // mapped to a VA range with a previous call to cuMemMap the actual freeing of
+  // memory allocation will happen on an eventual call to cuMemUnmap. Thus the
+  // allocation will be kept live until it is unmapped.
+  checkCudaErrors(cuMemRelease(cudaPositionHandle));
+  checkCudaErrors(cuMemRelease(cudaInCircleHandle));
+
+  CUmemAccessDesc accessDescriptor = {};
+  accessDescriptor.location.id = m_cudaDevice;
+  accessDescriptor.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+  accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
+  // Apply the access descriptor to the whole VA range. Essentially enables
+  // Read-Write access to the range.
+  checkCudaErrors(
+      cuMemSetAccess(d_ptr, m_totalAllocationSize, &accessDescriptor, 1));
 }

-void MonteCarloPiSimulation::setupSimulationAllocations()
-{
-    CUdeviceptr d_ptr = 0U;
-    size_t granularity = 0;
-    CUmemGenericAllocationHandle cudaPositionHandle, cudaInCircleHandle;
+void MonteCarloPiSimulation::cleanupSimulationAllocations() {
+  if (m_xyVector && m_pointsInsideCircle) {
+    // Unmap the mapped virtual memory region
+    // Since the handles to the mapped backing stores have already been released
+    // by cuMemRelease, and these are the only/last mappings referencing them,
+    // The backing stores will be freed.
+    checkCudaErrors(cuMemUnmap((CUdeviceptr)m_xyVector, m_totalAllocationSize));

-    CUmemAllocationProp allocProp = { };
-    allocProp.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-    allocProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-    allocProp.location.id = m_cudaDevice;
-    allocProp.win32HandleMetaData = NULL;
-    allocProp.requestedHandleTypes = ipcHandleTypeFlag;
+    checkIpcErrors(ipcCloseShareableHandle(m_posShareableHandle));
+    checkIpcErrors(ipcCloseShareableHandle(m_inCircleShareableHandle));

-    // Windows-specific LPSECURITYATTRIBUTES is required when
-    // CU_MEM_HANDLE_TYPE_WIN32 is used. The security attribute defines the scope
-    // of which exported allocations may be tranferred to other processes. For all
-    // other handle types, pass NULL.
-    getDefaultSecurityDescriptor(&allocProp);
+    // Free the virtual address region.
+    checkCudaErrors(
+        cuMemAddressFree((CUdeviceptr)m_xyVector, m_totalAllocationSize));

-    // Get the recommended granularity for m_cudaDevice.
-    checkCudaErrors(cuMemGetAllocationGranularity(&granularity, &allocProp, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
-
-    size_t xyPositionVecSize = m_numPoints * sizeof(*m_xyVector);
-    size_t inCircleVecSize = m_numPoints * sizeof(*m_pointsInsideCircle);
-
-    size_t xyPositionSize = ROUND_UP_TO_GRANULARITY(xyPositionVecSize, granularity);
-    size_t inCircleSize = ROUND_UP_TO_GRANULARITY(inCircleVecSize, granularity);
-    m_totalAllocationSize = (xyPositionSize + inCircleSize);
-
-    // Reserve the required contiguous VA space for the allocations
-    checkCudaErrors(cuMemAddressReserve(&d_ptr, m_totalAllocationSize, granularity, 0U, 0));
-
-    // Create the allocations as a pinned allocation on this device.
-    // Create an allocation to store all the positions of points on the xy plane and a second
-    // allocation which stores information if the corresponding position is inside the unit circle or not.
-    checkCudaErrors(cuMemCreate(&cudaPositionHandle, xyPositionSize, &allocProp, 0));
-    checkCudaErrors(cuMemCreate(&cudaInCircleHandle, inCircleSize, &allocProp, 0));
-
-    // Export the allocation to a platform-specific handle. The type of handle
-    // requested here must match the requestedHandleTypes field in the prop
-    // structure passed to cuMemCreate. The handle obtained here will be passed to vulkan
-    // to import the allocation.
-    checkCudaErrors(cuMemExportToShareableHandle((void *)&m_posShareableHandle, cudaPositionHandle, ipcHandleTypeFlag, 0));
-    checkCudaErrors(cuMemExportToShareableHandle((void *)&m_inCircleShareableHandle, cudaInCircleHandle, ipcHandleTypeFlag, 0));
-
-    CUdeviceptr va_position = d_ptr;
-    CUdeviceptr va_InCircle = va_position + xyPositionSize;
-    m_pointsInsideCircle = (float *)va_InCircle;
-    m_xyVector = (vec2 *)va_position;
-
-    // Assign the chunk to the appropriate VA range
-    checkCudaErrors(cuMemMap(va_position, xyPositionSize, 0, cudaPositionHandle, 0));
-    checkCudaErrors(cuMemMap(va_InCircle, inCircleSize, 0, cudaInCircleHandle, 0));
-
-    // Release the handles for the allocation. Since the allocation is currently mapped to a VA range
-    // with a previous call to cuMemMap the actual freeing of memory allocation will happen on an eventual call to
-    // cuMemUnmap. Thus the allocation will be kept live until it is unmapped.
-    checkCudaErrors(cuMemRelease(cudaPositionHandle));
-    checkCudaErrors(cuMemRelease(cudaInCircleHandle));
-
-    CUmemAccessDesc accessDescriptor = {};
-    accessDescriptor.location.id = m_cudaDevice;
-    accessDescriptor.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-    accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-
-    // Apply the access descriptor to the whole VA range. Essentially enables Read-Write access to the range.
-    checkCudaErrors(cuMemSetAccess(d_ptr, m_totalAllocationSize, &accessDescriptor, 1));
-}
-
-void MonteCarloPiSimulation::cleanupSimulationAllocations()
-{
-    if (m_xyVector && m_pointsInsideCircle) {
-        // Unmap the mapped virtual memory region
-        // Since the handles to the mapped backing stores have already been released
-        // by cuMemRelease, and these are the only/last mappings referencing them,
-        // The backing stores will be freed.
-        checkCudaErrors(cuMemUnmap((CUdeviceptr)m_xyVector, m_totalAllocationSize));
-
-        checkIpcErrors(ipcCloseShareableHandle(m_posShareableHandle));
-        checkIpcErrors(ipcCloseShareableHandle(m_inCircleShareableHandle));
-
-        // Free the virtual address region.
-        checkCudaErrors(cuMemAddressFree((CUdeviceptr)m_xyVector, m_totalAllocationSize));
-
-        m_xyVector = nullptr;
-        m_pointsInsideCircle = nullptr;
-    }
+    m_xyVector = nullptr;
+    m_pointsInsideCircle = nullptr;
+  }
 }
--- a/Samples/simpleVulkanMMAP/MonteCarloPi.h
+++ b/Samples/simpleVulkanMMAP/MonteCarloPi.h
@ -39,62 +39,57 @@

 typedef float vec2[2];

-class MonteCarloPiSimulation
-{
-    size_t m_numPoints;
+class MonteCarloPiSimulation {
+  size_t m_numPoints;

-    // Pointers to Cuda allocated buffers which are imported and used by vulkan as vertex buffer
-    vec2 *m_xyVector;
-    float *m_pointsInsideCircle;
+  // Pointers to Cuda allocated buffers which are imported and used by vulkan as
+  // vertex buffer
+  vec2 *m_xyVector;
+  float *m_pointsInsideCircle;

-    // Pointers to device and host allocated memories storing number of points that are inside the unit circle
-    float *m_numPointsInCircle;
-    float *m_hostNumPointsInCircle;
+  // Pointers to device and host allocated memories storing number of points
+  // that are inside the unit circle
+  float *m_numPointsInCircle;
+  float *m_hostNumPointsInCircle;

-    int m_blocks, m_threads;
+  int m_blocks, m_threads;

-    // Total size of allocations created by cuMemMap Apis. This size is the sum of sizes of
-    // m_xyVector and m_pointsInsideCircle buffers.
-    size_t m_totalAllocationSize;
+  // Total size of allocations created by cuMemMap Apis. This size is the sum of
+  // sizes of m_xyVector and m_pointsInsideCircle buffers.
+  size_t m_totalAllocationSize;

-    // Shareable Handles(a file descriptor on Linux and NT Handle on Windows), used for sharing cuda
-    // allocated memory with Vulkan
-    ShareableHandle m_posShareableHandle, m_inCircleShareableHandle;
+  // Shareable Handles(a file descriptor on Linux and NT Handle on Windows),
+  // used for sharing cuda allocated memory with Vulkan
+  ShareableHandle m_posShareableHandle, m_inCircleShareableHandle;

-    // Cuda Device corresponding to the Vulkan Physical device
-    int m_cudaDevice;
+  // Cuda Device corresponding to the Vulkan Physical device
+  int m_cudaDevice;

-    // Track and accumulate total points that have been simulated since start of the sample.
-    // The idea is to get a closer approximation to PI with time.
-    size_t m_totalPointsInsideCircle;
-    size_t m_totalPointsSimulated;
+  // Track and accumulate total points that have been simulated since start of
+  // the sample. The idea is to get a closer approximation to PI with time.
+  size_t m_totalPointsInsideCircle;
+  size_t m_totalPointsSimulated;

-    void setupSimulationAllocations();
-    void cleanupSimulationAllocations();
-    void getIdealExecutionConfiguration();
+  void setupSimulationAllocations();
+  void cleanupSimulationAllocations();
+  void getIdealExecutionConfiguration();

-public:
-    MonteCarloPiSimulation(size_t num_points);
-    ~MonteCarloPiSimulation();
-    void initSimulation(int cudaDevice, cudaStream_t stream = 0);
-    void stepSimulation(float time, cudaStream_t stream = 0);
-    static void computePiCallback(void *args);
+ public:
+  MonteCarloPiSimulation(size_t num_points);
+  ~MonteCarloPiSimulation();
+  void initSimulation(int cudaDevice, cudaStream_t stream = 0);
+  void stepSimulation(float time, cudaStream_t stream = 0);
+  static void computePiCallback(void *args);

-    size_t getNumPoints() const {
-        return m_numPoints;
-    }
+  size_t getNumPoints() const { return m_numPoints; }

-    float getNumPointsInCircle() const {
-        return *m_hostNumPointsInCircle;
-    }
-
-    ShareableHandle &getPositionShareableHandle() {
-        return m_posShareableHandle;
-    }
-    ShareableHandle &getInCircleShareableHandle() {
-        return m_inCircleShareableHandle;
-    }
+  float getNumPointsInCircle() const { return *m_hostNumPointsInCircle; }

+  ShareableHandle &getPositionShareableHandle() { return m_posShareableHandle; }
+  ShareableHandle &getInCircleShareableHandle() {
+    return m_inCircleShareableHandle;
+  }
 };

-#endif // __PISIM_H__
+#endif  // __PISIM_H__
--- a/Samples/simpleVulkanMMAP/VulkanBaseApp.cpp
+++ b/Samples/simpleVulkanMMAP/VulkanBaseApp.cpp
--- a/Samples/simpleVulkanMMAP/VulkanBaseApp.h
+++ b/Samples/simpleVulkanMMAP/VulkanBaseApp.h
@ -40,101 +40,119 @@

 struct GLFWwindow;

-class VulkanBaseApp
-{
-public:
-    VulkanBaseApp(const std::string& appName, bool enableValidation = false);
-    static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType();
-    static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType();
-    virtual ~VulkanBaseApp();
-    void init();
-    void *getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType);
-    void *getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType);
-    bool isVkPhysicalDeviceUuid(void *Uuid);
-    void createExternalSemaphore(VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType);
-    void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& bufferMemory);
-    void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer& buffer, VkDeviceMemory& bufferMemory);
-    void importExternalBuffer(void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& memory);
-    void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size);
-    VkCommandBuffer beginSingleTimeCommands();
-    void endSingleTimeCommands(VkCommandBuffer commandBuffer);
-    void mainLoop();
-protected:
-    const std::string m_appName;
-    const bool m_enableValidation;
-    VkInstance m_instance;
-    VkDebugUtilsMessengerEXT m_debugMessenger;
-    VkSurfaceKHR m_surface;
-    VkPhysicalDevice m_physicalDevice;
-    uint8_t m_deviceUUID[VK_UUID_SIZE];
-    VkDevice m_device;
-    VkQueue m_graphicsQueue;
-    VkQueue m_presentQueue;
-    VkSwapchainKHR m_swapChain;
-    std::vector<VkImage> m_swapChainImages;
-    VkFormat m_swapChainFormat;
-    VkExtent2D m_swapChainExtent;
-    std::vector<VkImageView> m_swapChainImageViews;
-    std::vector<std::pair<VkShaderStageFlagBits, std::string> > m_shaderFiles;
-    VkRenderPass m_renderPass;
-    VkPipelineLayout m_pipelineLayout;
-    VkPipeline m_graphicsPipeline;
-    std::vector<VkFramebuffer> m_swapChainFramebuffers;
-    VkCommandPool m_commandPool;
-    std::vector<VkCommandBuffer> m_commandBuffers;
-    std::vector<VkSemaphore> m_imageAvailableSemaphores;
-    std::vector<VkSemaphore> m_renderFinishedSemaphores;
-    std::vector<VkFence> m_inFlightFences;
-    std::vector<VkBuffer> m_uniformBuffers;
-    std::vector<VkDeviceMemory> m_uniformMemory;
-    VkDescriptorSetLayout m_descriptorSetLayout;
-    VkDescriptorPool m_descriptorPool;
-    std::vector<VkDescriptorSet> m_descriptorSets;
+class VulkanBaseApp {
+ public:
+  VulkanBaseApp(const std::string& appName, bool enableValidation = false);
+  static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType();
+  static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType();
+  virtual ~VulkanBaseApp();
+  void init();
+  void* getMemHandle(VkDeviceMemory memory,
+                     VkExternalMemoryHandleTypeFlagBits handleType);
+  void* getSemaphoreHandle(VkSemaphore semaphore,
+                           VkExternalSemaphoreHandleTypeFlagBits handleType);
+  bool isVkPhysicalDeviceUuid(void* Uuid);
+  void createExternalSemaphore(
+      VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType);
+  void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage,
+                    VkMemoryPropertyFlags properties, VkBuffer& buffer,
+                    VkDeviceMemory& bufferMemory);
+  void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage,
+                            VkMemoryPropertyFlags properties,
+                            VkExternalMemoryHandleTypeFlagsKHR extMemHandleType,
+                            VkBuffer& buffer, VkDeviceMemory& bufferMemory);
+  void importExternalBuffer(void* handle,
+                            VkExternalMemoryHandleTypeFlagBits handleType,
+                            size_t size, VkBufferUsageFlags usage,
+                            VkMemoryPropertyFlags properties, VkBuffer& buffer,
+                            VkDeviceMemory& memory);
+  void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size);
+  VkCommandBuffer beginSingleTimeCommands();
+  void endSingleTimeCommands(VkCommandBuffer commandBuffer);
+  void mainLoop();

-    VkImage m_depthImage;
-    VkDeviceMemory m_depthImageMemory;
-    VkImageView m_depthImageView;
-    size_t m_currentFrame;
-    bool m_framebufferResized;
+ protected:
+  const std::string m_appName;
+  const bool m_enableValidation;
+  VkInstance m_instance;
+  VkDebugUtilsMessengerEXT m_debugMessenger;
+  VkSurfaceKHR m_surface;
+  VkPhysicalDevice m_physicalDevice;
+  uint8_t m_deviceUUID[VK_UUID_SIZE];
+  VkDevice m_device;
+  VkQueue m_graphicsQueue;
+  VkQueue m_presentQueue;
+  VkSwapchainKHR m_swapChain;
+  std::vector<VkImage> m_swapChainImages;
+  VkFormat m_swapChainFormat;
+  VkExtent2D m_swapChainExtent;
+  std::vector<VkImageView> m_swapChainImageViews;
+  std::vector<std::pair<VkShaderStageFlagBits, std::string> > m_shaderFiles;
+  VkRenderPass m_renderPass;
+  VkPipelineLayout m_pipelineLayout;
+  VkPipeline m_graphicsPipeline;
+  std::vector<VkFramebuffer> m_swapChainFramebuffers;
+  VkCommandPool m_commandPool;
+  std::vector<VkCommandBuffer> m_commandBuffers;
+  std::vector<VkSemaphore> m_imageAvailableSemaphores;
+  std::vector<VkSemaphore> m_renderFinishedSemaphores;
+  std::vector<VkFence> m_inFlightFences;
+  std::vector<VkBuffer> m_uniformBuffers;
+  std::vector<VkDeviceMemory> m_uniformMemory;
+  VkDescriptorSetLayout m_descriptorSetLayout;
+  VkDescriptorPool m_descriptorPool;
+  std::vector<VkDescriptorSet> m_descriptorSets;

-    virtual void initVulkanApp() {}
-    virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {}
-    virtual std::vector<const char *> getRequiredExtensions() const;
-    virtual std::vector<const char *> getRequiredDeviceExtensions() const;
-    virtual void getVertexDescriptions(std::vector<VkVertexInputBindingDescription>& bindingDesc, std::vector<VkVertexInputAttributeDescription>& attribDesc);
-    virtual void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info);
-    virtual void getWaitFrameSemaphores(std::vector<VkSemaphore>& wait, std::vector< VkPipelineStageFlags>& waitStages) const;
-    virtual void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const;
-    virtual VkDeviceSize getUniformSize() const;
-    virtual void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame);
-    virtual void drawFrame();
-private:
-    GLFWwindow *m_window;
+  VkImage m_depthImage;
+  VkDeviceMemory m_depthImageMemory;
+  VkImageView m_depthImageView;
+  size_t m_currentFrame;
+  bool m_framebufferResized;

-    void initWindow();
-    void initVulkan();
-    void createInstance();
-    void createSurface();
-    void createDevice();
-    void createSwapChain();
-    void createImageViews();
-    void createRenderPass();
-    void createDescriptorSetLayout();
-    void createGraphicsPipeline();
-    void createFramebuffers();
-    void createCommandPool();
-    void createDepthResources();
-    void createUniformBuffers();
-    void createDescriptorPool();
-    void createDescriptorSets();
-    void createCommandBuffers();
-    void createSyncObjects();
+  virtual void initVulkanApp() {}
+  virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {}
+  virtual std::vector<const char*> getRequiredExtensions() const;
+  virtual std::vector<const char*> getRequiredDeviceExtensions() const;
+  virtual void getVertexDescriptions(
+      std::vector<VkVertexInputBindingDescription>& bindingDesc,
+      std::vector<VkVertexInputAttributeDescription>& attribDesc);
+  virtual void getAssemblyStateInfo(
+      VkPipelineInputAssemblyStateCreateInfo& info);
+  virtual void getWaitFrameSemaphores(
+      std::vector<VkSemaphore>& wait,
+      std::vector<VkPipelineStageFlags>& waitStages) const;
+  virtual void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const;
+  virtual VkDeviceSize getUniformSize() const;
+  virtual void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame);
+  virtual void drawFrame();

-    void cleanupSwapChain();
-    void recreateSwapChain();
+ private:
+  GLFWwindow* m_window;

-    bool isSuitableDevice(VkPhysicalDevice dev) const;
-    static void resizeCallback(GLFWwindow *window, int width, int height);
+  void initWindow();
+  void initVulkan();
+  void createInstance();
+  void createSurface();
+  void createDevice();
+  void createSwapChain();
+  void createImageViews();
+  void createRenderPass();
+  void createDescriptorSetLayout();
+  void createGraphicsPipeline();
+  void createFramebuffers();
+  void createCommandPool();
+  void createDepthResources();
+  void createUniformBuffers();
+  void createDescriptorPool();
+  void createDescriptorSets();
+  void createCommandBuffers();
+  void createSyncObjects();
+
+  void cleanupSwapChain();
+  void recreateSwapChain();
+
+  bool isSuitableDevice(VkPhysicalDevice dev) const;
+  static void resizeCallback(GLFWwindow* window, int width, int height);
 };

 void readFile(std::istream& s, std::vector<char>& data);
--- a/Samples/simpleVulkanMMAP/VulkanCudaInterop.h
+++ b/Samples/simpleVulkanMMAP/VulkanCudaInterop.h
@ -35,41 +35,48 @@
 #include <helper_cuda.h>

 bool isDeviceCompatible(void *Uuid, size_t size) {
+  int cudaDevice = cudaInvalidDeviceId;
+  int deviceCount;
+  checkCudaErrors(cudaGetDeviceCount(&deviceCount));

-    int cudaDevice = cudaInvalidDeviceId;
-    int deviceCount;
-    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
-
-    for (int i = 0; i < deviceCount; ++i) {
-        cudaDeviceProp devProp = { };
-        checkCudaErrors(cudaGetDeviceProperties(&devProp, i));
-        if (!memcmp(&devProp.uuid, Uuid, size)) {
-            cudaDevice = i;
-            break;
-        }
-    }
-    if (cudaDevice == cudaInvalidDeviceId) {
-        return false;
+  for (int i = 0; i < deviceCount; ++i) {
+    cudaDeviceProp devProp = {};
+    checkCudaErrors(cudaGetDeviceProperties(&devProp, i));
+    if (!memcmp(&devProp.uuid, Uuid, size)) {
+      cudaDevice = i;
+      break;
    }
+  }
+  if (cudaDevice == cudaInvalidDeviceId) {
+    return false;
+  }

-    int deviceSupportsHandle = 0;
-    int attributeVal = 0;
-    int deviceComputeMode = 0;
+  int deviceSupportsHandle = 0;
+  int attributeVal = 0;
+  int deviceComputeMode = 0;

-    checkCudaErrors(cuDeviceGetAttribute(&deviceComputeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, cudaDevice));
-    checkCudaErrors(cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cudaDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &deviceComputeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, cudaDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
+      cudaDevice));

 #if defined(__linux__)
-    checkCudaErrors(cuDeviceGetAttribute(&deviceSupportsHandle, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED, cudaDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &deviceSupportsHandle,
+      CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED,
+      cudaDevice));
 #else
-    checkCudaErrors(cuDeviceGetAttribute(&deviceSupportsHandle, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, cudaDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &deviceSupportsHandle,
+      CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, cudaDevice));
 #endif

-    if ((deviceComputeMode != CU_COMPUTEMODE_DEFAULT) || !attributeVal || !deviceSupportsHandle) {
-        return false;
-    }
-    return true;
+  if ((deviceComputeMode != CU_COMPUTEMODE_DEFAULT) || !attributeVal ||
+      !deviceSupportsHandle) {
+    return false;
+  }
+  return true;
 }

-#endif // __VKCUDA_H__
-
+#endif  // __VKCUDA_H__
--- a/Samples/simpleVulkanMMAP/frag.spv
+++ b/Samples/simpleVulkanMMAP/frag.spv
--- a/Samples/simpleVulkanMMAP/main.cpp
+++ b/Samples/simpleVulkanMMAP/main.cpp
@ -25,11 +25,12 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

- /*
-  * This sample demonstrates CUDA Interop with Vulkan using cuMemMap APIs.
-  * Allocating device memory and updating values in those allocations are performed by CUDA
-  * and the contents of the allocation are visualized by Vulkan.
-  */
+/*
+ * This sample demonstrates CUDA Interop with Vulkan using cuMemMap APIs.
+ * Allocating device memory and updating values in those allocations are
+ * performed by CUDA and the contents of the allocation are visualized by
+ * Vulkan.
+ */

 #include "VulkanBaseApp.h"

@ -55,25 +56,23 @@

 std::string execution_path;

-class VulkanCudaPi : public VulkanBaseApp
-{
-    typedef struct UniformBufferObject_st {
-        float frame;
-    } UniformBufferObject;
+class VulkanCudaPi : public VulkanBaseApp {
+  typedef struct UniformBufferObject_st { float frame; } UniformBufferObject;

-    VkBuffer m_inCircleBuffer, m_xyPositionBuffer;
-    VkDeviceMemory m_inCircleMemory, m_xyPositionMemory;
-    VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore;
-    MonteCarloPiSimulation m_sim;
-    UniformBufferObject m_ubo;
-    cudaStream_t m_stream;
-    cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore;
-    using chrono_tp = std::chrono::time_point<std::chrono::high_resolution_clock>;
-    chrono_tp m_lastTime;
-    size_t m_lastFrame;
-public:
-    VulkanCudaPi(size_t num_points) :
-        VulkanBaseApp("simpleVulkanMMAP", ENABLE_VALIDATION),
+  VkBuffer m_inCircleBuffer, m_xyPositionBuffer;
+  VkDeviceMemory m_inCircleMemory, m_xyPositionMemory;
+  VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore;
+  MonteCarloPiSimulation m_sim;
+  UniformBufferObject m_ubo;
+  cudaStream_t m_stream;
+  cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore;
+  using chrono_tp = std::chrono::time_point<std::chrono::high_resolution_clock>;
+  chrono_tp m_lastTime;
+  size_t m_lastFrame;
+
+ public:
+  VulkanCudaPi(size_t num_points)
+      : VulkanBaseApp("simpleVulkanMMAP", ENABLE_VALIDATION),
        m_inCircleBuffer(VK_NULL_HANDLE),
        m_xyPositionBuffer(VK_NULL_HANDLE),
        m_inCircleMemory(VK_NULL_HANDLE),
@ -86,232 +85,268 @@ public:
        m_cudaWaitSemaphore(),
        m_cudaSignalSemaphore(),
        m_lastFrame(0) {
+    // Add our compiled vulkan shader files
+    char* vertex_shader_path =
+        sdkFindFilePath("vert.spv", execution_path.c_str());
+    char* fragment_shader_path =
+        sdkFindFilePath("frag.spv", execution_path.c_str());
+    m_shaderFiles.push_back(
+        std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path));
+    m_shaderFiles.push_back(
+        std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path));
+  }

-        // Add our compiled vulkan shader files
-        char* vertex_shader_path = sdkFindFilePath("montecarlo.vert", execution_path.c_str());
-        char* fragment_shader_path = sdkFindFilePath("montecarlo.frag", execution_path.c_str());
-        m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path));
-        m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path));
+  ~VulkanCudaPi() {
+    if (m_stream) {
+      // Make sure there's no pending work before we start tearing down
+      checkCudaErrors(cudaStreamSynchronize(m_stream));
+      checkCudaErrors(cudaStreamDestroy(m_stream));
    }

-    ~VulkanCudaPi() {
-        if (m_stream) {
-            // Make sure there's no pending work before we start tearing down
-            checkCudaErrors(cudaStreamSynchronize(m_stream));
-            checkCudaErrors(cudaStreamDestroy(m_stream));
-        }
+    if (m_vkSignalSemaphore != VK_NULL_HANDLE) {
+      checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore));
+      vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr);
+    }
+    if (m_vkWaitSemaphore != VK_NULL_HANDLE) {
+      checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore));
+      vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr);
+    }
+    if (m_xyPositionBuffer != VK_NULL_HANDLE) {
+      vkDestroyBuffer(m_device, m_xyPositionBuffer, nullptr);
+    }
+    if (m_xyPositionMemory != VK_NULL_HANDLE) {
+      vkFreeMemory(m_device, m_xyPositionMemory, nullptr);
+    }
+    if (m_inCircleBuffer != VK_NULL_HANDLE) {
+      vkDestroyBuffer(m_device, m_inCircleBuffer, nullptr);
+    }
+    if (m_inCircleMemory != VK_NULL_HANDLE) {
+      vkFreeMemory(m_device, m_inCircleMemory, nullptr);
+    }
+  }

-        if (m_vkSignalSemaphore != VK_NULL_HANDLE) {
-            checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore));
-            vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr);
-        }
-        if (m_vkWaitSemaphore != VK_NULL_HANDLE) {
-            checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore));
-            vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr);
-        }
-        if (m_xyPositionBuffer != VK_NULL_HANDLE) {
-            vkDestroyBuffer(m_device, m_xyPositionBuffer, nullptr);
-        }
-        if (m_xyPositionMemory != VK_NULL_HANDLE) {
-            vkFreeMemory(m_device, m_xyPositionMemory, nullptr);
-        }
-        if (m_inCircleBuffer != VK_NULL_HANDLE) {
-            vkDestroyBuffer(m_device, m_inCircleBuffer, nullptr);
-        }
-        if (m_inCircleMemory != VK_NULL_HANDLE) {
-            vkFreeMemory(m_device, m_inCircleMemory, nullptr);
-        }
+  void fillRenderingCommandBuffer(VkCommandBuffer& commandBuffer) {
+    VkBuffer vertexBuffers[] = {m_inCircleBuffer, m_xyPositionBuffer};
+    VkDeviceSize offsets[] = {0, 0};
+    vkCmdBindVertexBuffers(commandBuffer, 0,
+                           sizeof(vertexBuffers) / sizeof(vertexBuffers[0]),
+                           vertexBuffers, offsets);
+    vkCmdDraw(commandBuffer, (uint32_t)(m_sim.getNumPoints()), 1, 0, 0);
+  }
+
+  void getVertexDescriptions(
+      std::vector<VkVertexInputBindingDescription>& bindingDesc,
+      std::vector<VkVertexInputAttributeDescription>& attribDesc) {
+    bindingDesc.resize(2);
+    attribDesc.resize(2);
+
+    bindingDesc[0].binding = 0;
+    bindingDesc[0].stride = sizeof(float);
+    bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
+
+    bindingDesc[1].binding = 1;
+    bindingDesc[1].stride = sizeof(vec2);
+    bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
+
+    attribDesc[0].binding = 0;
+    attribDesc[0].location = 0;
+    attribDesc[0].format = VK_FORMAT_R32_SFLOAT;
+    attribDesc[0].offset = 0;
+
+    attribDesc[1].binding = 1;
+    attribDesc[1].location = 1;
+    attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT;
+    attribDesc[1].offset = 0;
+  }
+
+  void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info) {
+    info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
+    info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
+    info.primitiveRestartEnable = VK_FALSE;
+  }
+
+  void getWaitFrameSemaphores(
+      std::vector<VkSemaphore>& wait,
+      std::vector<VkPipelineStageFlags>& waitStages) const {
+    if (m_currentFrame != 0) {
+      // Have vulkan wait until cuda is done with the vertex buffer before
+      // rendering.
+      // We don't do this on the first frame, as the wait semaphore hasn't been
+      // initialized yet.
+      wait.push_back(m_vkWaitSemaphore);
+      // We want to wait until all the pipeline commands are complete before
+      // letting cuda work
+      waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
+    }
+  }
+
+  void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const {
+    // Add this semaphore for vulkan to signal once the vertex buffer is ready
+    // for cuda to modify
+    signal.push_back(m_vkSignalSemaphore);
+  }
+
+  void initVulkanApp() {
+    const size_t nVerts = m_sim.getNumPoints();
+
+    // Obtain cuda device id for the device corresponding to the Vulkan physical
+    // device
+    int deviceCount;
+    int cudaDevice = cudaInvalidDeviceId;
+    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
+    for (int dev = 0; dev < deviceCount; ++dev) {
+      cudaDeviceProp devProp = {};
+      checkCudaErrors(cudaGetDeviceProperties(&devProp, dev));
+      if (isVkPhysicalDeviceUuid(&devProp.uuid)) {
+        cudaDevice = dev;
+        break;
+      }
+    }
+    if (cudaDevice == cudaInvalidDeviceId) {
+      throw std::runtime_error("No Suitable device found!");
    }

-    void fillRenderingCommandBuffer(VkCommandBuffer& commandBuffer) {
-        VkBuffer vertexBuffers[] = { m_inCircleBuffer, m_xyPositionBuffer };
-        VkDeviceSize offsets[] = { 0, 0 };
-        vkCmdBindVertexBuffers(commandBuffer, 0, sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), vertexBuffers, offsets);
-        vkCmdDraw(commandBuffer, (uint32_t)(m_sim.getNumPoints()), 1, 0, 0);
+    // On the corresponding cuda device, create the cuda stream we'll be using
+    checkCudaErrors(cudaSetDevice(cudaDevice));
+    checkCudaErrors(
+        cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking));
+    m_sim.initSimulation(cudaDevice, m_stream);
+
+    importExternalBuffer(
+        (void*)(uintptr_t)m_sim.getPositionShareableHandle(),
+        getDefaultMemHandleType(), nVerts * sizeof(vec2),
+        VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
+        VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_xyPositionBuffer,
+        m_xyPositionMemory);
+
+    importExternalBuffer(
+        (void*)(uintptr_t)m_sim.getInCircleShareableHandle(),
+        getDefaultMemHandleType(), nVerts * sizeof(float),
+        VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
+        VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_inCircleBuffer,
+        m_inCircleMemory);
+
+    // Create the semaphore vulkan will signal when it's done with the vertex
+    // buffer
+    createExternalSemaphore(m_vkSignalSemaphore,
+                            getDefaultSemaphoreHandleType());
+    // Create the semaphore vulkan will wait for before using the vertex buffer
+    createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType());
+    // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait
+    importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore,
+                                getDefaultSemaphoreHandleType());
+    // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait
+    importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore,
+                                getDefaultSemaphoreHandleType());
+  }
+
+  void importCudaExternalSemaphore(
+      cudaExternalSemaphore_t& cudaSem, VkSemaphore& vkSem,
+      VkExternalSemaphoreHandleTypeFlagBits handleType) {
+    cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {};
+
+    if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) {
+      externalSemaphoreHandleDesc.type =
+          cudaExternalSemaphoreHandleTypeOpaqueWin32;
+    } else if (handleType &
+               VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) {
+      externalSemaphoreHandleDesc.type =
+          cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt;
+    } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) {
+      externalSemaphoreHandleDesc.type =
+          cudaExternalSemaphoreHandleTypeOpaqueFd;
+    } else {
+      throw std::runtime_error("Unknown handle type requested!");
    }

-    void getVertexDescriptions(std::vector<VkVertexInputBindingDescription>& bindingDesc, std::vector<VkVertexInputAttributeDescription>& attribDesc) {
-        bindingDesc.resize(2);
-        attribDesc.resize(2);
-
-        bindingDesc[0].binding = 0;
-        bindingDesc[0].stride = sizeof(float);
-        bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
-
-        bindingDesc[1].binding = 1;
-        bindingDesc[1].stride = sizeof(vec2);
-        bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
-
-        attribDesc[0].binding = 0;
-        attribDesc[0].location = 0;
-        attribDesc[0].format = VK_FORMAT_R32_SFLOAT;
-        attribDesc[0].offset = 0;
-
-        attribDesc[1].binding = 1;
-        attribDesc[1].location = 1;
-        attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT;
-        attribDesc[1].offset = 0;
-    }
-
-    void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info) {
-        info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
-        info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
-        info.primitiveRestartEnable = VK_FALSE;
-    }
-
-    void getWaitFrameSemaphores(std::vector<VkSemaphore>& wait, std::vector< VkPipelineStageFlags>& waitStages) const {
-        if (m_currentFrame != 0) {
-            // Have vulkan wait until cuda is done with the vertex buffer before rendering
-            // We don't do this on the first frame, as the wait semaphore hasn't been initialized yet
-            wait.push_back(m_vkWaitSemaphore);
-            // We want to wait until all the pipeline commands are complete before letting cuda work
-            waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
-        }
-    }
-
-    void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const {
-        // Add this semaphore for vulkan to signal once the vertex buffer is ready for cuda to modify
-        signal.push_back(m_vkSignalSemaphore);
-    }
-
-    void initVulkanApp() {
-        const size_t nVerts = m_sim.getNumPoints();
-
-        // Obtain cuda device id for the device corresponding to the Vulkan physical device
-        int deviceCount;
-        int cudaDevice = cudaInvalidDeviceId;
-        checkCudaErrors(cudaGetDeviceCount(&deviceCount));
-        for (int dev = 0; dev < deviceCount; ++dev) {
-            cudaDeviceProp devProp = { };
-            checkCudaErrors(cudaGetDeviceProperties(&devProp, dev));
-            if (isVkPhysicalDeviceUuid(&devProp.uuid)) {
-                cudaDevice = dev;
-                break;
-            }
-        }
-        if (cudaDevice == cudaInvalidDeviceId) {
-            throw std::runtime_error("No Suitable device found!");
-        }
-
-        // On the corresponding cuda device, create the cuda stream we'll using
-        checkCudaErrors(cudaSetDevice(cudaDevice));
-        checkCudaErrors(cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking));
-        m_sim.initSimulation(cudaDevice, m_stream);
-
-        importExternalBuffer((void *)(uintptr_t)m_sim.getPositionShareableHandle(), getDefaultMemHandleType(), nVerts * sizeof(vec2),
-            VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
-            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_xyPositionBuffer, m_xyPositionMemory);
-
-        importExternalBuffer((void *)(uintptr_t)m_sim.getInCircleShareableHandle(), getDefaultMemHandleType(), nVerts * sizeof(float),
-            VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
-            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_inCircleBuffer, m_inCircleMemory);
-
-        // Create the semaphore vulkan will signal when it's done with the vertex buffer
-        createExternalSemaphore(m_vkSignalSemaphore, getDefaultSemaphoreHandleType());
-        // Create the semaphore vulkan will wait for before using the vertex buffer
-        createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType());
-        // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait
-        importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, getDefaultSemaphoreHandleType());
-        // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait
-        importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, getDefaultSemaphoreHandleType());
-    }
-
-    void importCudaExternalSemaphore(cudaExternalSemaphore_t& cudaSem, VkSemaphore& vkSem, VkExternalSemaphoreHandleTypeFlagBits handleType) {
-        cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {};
-
-        if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) {
-            externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32;
-        }
-        else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) {
-            externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt;
-        }
-        else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) {
-            externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd;
-        }
-        else {
-            throw std::runtime_error("Unknown handle type requested!");
-        }
-
 #ifdef _WIN64
-        externalSemaphoreHandleDesc.handle.win32.handle = (HANDLE)getSemaphoreHandle(vkSem, handleType);
+    externalSemaphoreHandleDesc.handle.win32.handle =
+        (HANDLE)getSemaphoreHandle(vkSem, handleType);
 #else
-        externalSemaphoreHandleDesc.handle.fd = (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType);
+    externalSemaphoreHandleDesc.handle.fd =
+        (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType);
 #endif

-        externalSemaphoreHandleDesc.flags = 0;
+    externalSemaphoreHandleDesc.flags = 0;

-        checkCudaErrors(cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc));
-    }
+    checkCudaErrors(
+        cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc));
+  }

-    VkDeviceSize getUniformSize() const {
-        return sizeof(UniformBufferObject);
-    }
+  VkDeviceSize getUniformSize() const { return sizeof(UniformBufferObject); }

-    void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame) {
-        m_ubo.frame = (float)globalFrame;
-        void *data;
-        vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, &data);
-        memcpy(data, &m_ubo, sizeof(m_ubo));
-        vkUnmapMemory(m_device, m_uniformMemory[imageIndex]);
-    }
+  void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame) {
+    m_ubo.frame = (float)globalFrame;
+    void* data;
+    vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0,
+                &data);
+    memcpy(data, &m_ubo, sizeof(m_ubo));
+    vkUnmapMemory(m_device, m_uniformMemory[imageIndex]);
+  }

-    std::vector<const char *> getRequiredExtensions() const {
-        std::vector<const char *> extensions;
-        extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME);
-        extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME);
-        return extensions;
-    }
+  std::vector<const char*> getRequiredExtensions() const {
+    std::vector<const char*> extensions;
+    extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_FENCE_CAPABILITIES_EXTENSION_NAME);
+    extensions.push_back(
+        VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME);
+    return extensions;
+  }

-    std::vector<const char *> getRequiredDeviceExtensions() const {
-        std::vector<const char *> extensions;
+  std::vector<const char*> getRequiredDeviceExtensions() const {
+    std::vector<const char*> extensions;

-        extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME);
-        extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME);
 #ifdef _WIN64
-        extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME);
-        extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME);
 #else
-        extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME);
-        extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME);
 #endif /* _WIN64 */
-        return extensions;
+    return extensions;
+  }
+
+  void drawFrame() {
+    static chrono_tp startTime = std::chrono::high_resolution_clock::now();
+
+    chrono_tp currentTime = std::chrono::high_resolution_clock::now();
+    float time = std::chrono::duration<float, std::chrono::seconds::period>(
+                     currentTime - startTime)
+                     .count();
+
+    if (m_currentFrame == 0) {
+      m_lastTime = startTime;
    }

-    void drawFrame() {
-        static chrono_tp startTime = std::chrono::high_resolution_clock::now();
+    cudaExternalSemaphoreWaitParams waitParams = {};
+    waitParams.flags = 0;
+    waitParams.params.fence.value = 0;

-        chrono_tp currentTime = std::chrono::high_resolution_clock::now();
-        float time = std::chrono::duration<float, std::chrono::seconds::period>(currentTime - startTime).count();
+    cudaExternalSemaphoreSignalParams signalParams = {};
+    signalParams.flags = 0;
+    signalParams.params.fence.value = 0;

-        if (m_currentFrame == 0) {
-            m_lastTime = startTime;
-        }
+    // Have vulkan draw the current frame...
+    VulkanBaseApp::drawFrame();
+    // Wait for vulkan to complete its work
+    checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore,
+                                                    &waitParams, 1, m_stream));
+    // Now step the simulation
+    m_sim.stepSimulation(time, m_stream);

-        cudaExternalSemaphoreWaitParams waitParams = {};
-        waitParams.flags = 0;
-        waitParams.params.fence.value = 0;
-
-        cudaExternalSemaphoreSignalParams signalParams = {};
-        signalParams.flags = 0;
-        signalParams.params.fence.value = 0;
-
-        // Have vulkan draw the current frame...
-        VulkanBaseApp::drawFrame();
-        // Wait for vulkan to complete it's work
-        checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore, &waitParams, 1, m_stream));
-        // Now step the simulation
-        m_sim.stepSimulation(time, m_stream);
-
-        // Signal vulkan to continue with the updated buffers
-        checkCudaErrors(cudaSignalExternalSemaphoresAsync(&m_cudaSignalSemaphore, &signalParams, 1, m_stream));
-    }
+    // Signal vulkan to continue with the updated buffers
+    checkCudaErrors(cudaSignalExternalSemaphoresAsync(
+        &m_cudaSignalSemaphore, &signalParams, 1, m_stream));
+  }
 };

-int main(int argc, char **argv)
-{
-    execution_path = argv[0];
-    VulkanCudaPi app(NUM_SIMULATION_POINTS);
-    app.init();
-    app.mainLoop();
-    return 0;
+int main(int argc, char** argv) {
+  execution_path = argv[0];
+  VulkanCudaPi app(NUM_SIMULATION_POINTS);
+  app.init();
+  app.mainLoop();
+  return 0;
 }
--- a/Samples/simpleVulkanMMAP/vert.spv
+++ b/Samples/simpleVulkanMMAP/vert.spv
--- a/Samples/vulkanImageCUDA/Build_instructions.txt
+++ b/Samples/vulkanImageCUDA/Build_instructions.txt
@ -19,8 +19,17 @@ For Linux:
 -- Install "libxcb1-dev" and "xorg-dev" as GLFW3 is depended on it
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH

+
 For Linux aarch64(L4T):
 -- Install GLFW3 library using "sudo apt-get install libglfw3-dev" this will provide glfw3 
 -- install above will also provide libvulkan-dev as dependencies
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
-- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr"
+-- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr"
+
+
+For Shader changes:
+-- Update the shader.vert and/or shader.frag shader source files as required
+-- Use the glslc shader compiler from the installed Vulkan SDK's bin directory to compile shaders as:
+    glslc shader.vert -o vert.spv
+    glslc shader.frag -o frag.spv
+** Make sure to add glslc's path to your PATH environment variable **
--- a/Samples/vulkanImageCUDA/frag.spv
+++ b/Samples/vulkanImageCUDA/frag.spv
--- a/Samples/vulkanImageCUDA/vert.spv
+++ b/Samples/vulkanImageCUDA/vert.spv
--- a/Samples/vulkanImageCUDA/vulkanImageCUDA.cu
+++ b/Samples/vulkanImageCUDA/vulkanImageCUDA.cu
@ -69,7 +69,7 @@ const std::vector<const char*> validationLayers = {
    "VK_LAYER_KHRONOS_validation"};

 #ifdef NDEBUG
-const bool enableValidationLayers = false;
+const bool enableValidationLayers = true;
 #else
 const bool enableValidationLayers = false;
 #endif
@ -494,7 +494,7 @@ class vulkanImageCUDA {

  unsigned int* image_data = NULL;
  unsigned int imageWidth, imageHeight;
-  unsigned int mipLevels;
+  unsigned int mipLevels = 1;
  size_t totalImageMemSize;

  // CUDA objects
@ -630,6 +630,9 @@ class vulkanImageCUDA {
    vkDestroyBuffer(device, vertexBuffer, nullptr);
    vkFreeMemory(device, vertexBufferMemory, nullptr);

+    vkDestroySemaphore(device, cudaUpdateVkSemaphore, nullptr);
+    vkDestroySemaphore(device, vkUpdateCudaSemaphore, nullptr);
+
    for (size_t i = 0; i < MAX_FRAMES; i++) {
      vkDestroySemaphore(device, renderFinishedSemaphores[i], nullptr);
      vkDestroySemaphore(device, imageAvailableSemaphores[i], nullptr);
@ -686,7 +689,7 @@ class vulkanImageCUDA {
    appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
    appInfo.pEngineName = "No Engine";
    appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
-    appInfo.apiVersion = VK_API_VERSION_1_0;
+    appInfo.apiVersion = VK_API_VERSION_1_1;

    VkInstanceCreateInfo createInfo = {};
    createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
@ -905,6 +908,7 @@ class vulkanImageCUDA {
    }

    VkPhysicalDeviceFeatures deviceFeatures = {};
+    deviceFeatures.samplerAnisotropy = VK_TRUE;

    VkDeviceCreateInfo createInfo = {};
    createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
@ -1078,8 +1082,8 @@ class vulkanImageCUDA {
  }

  void createGraphicsPipeline() {
-    auto vertShaderCode = readFile("shader.vert");
-    auto fragShaderCode = readFile("shader.frag");
+    auto vertShaderCode = readFile("vert.spv");
+    auto fragShaderCode = readFile("frag.spv");

    VkShaderModule vertShaderModule = createShaderModule(vertShaderCode);
    VkShaderModule fragShaderModule = createShaderModule(fragShaderCode);
@ -1268,7 +1272,7 @@ class vulkanImageCUDA {

    // VK_FORMAT_R8G8B8A8_UNORM changed to VK_FORMAT_R8G8B8A8_UINT
    createImage(
-        imageWidth, imageHeight, VK_FORMAT_R8G8B8A8_UINT,
+        imageWidth, imageHeight, VK_FORMAT_R8G8B8A8_UNORM,
        VK_IMAGE_TILING_OPTIMAL,
        VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
            VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT,
@ -1280,9 +1284,6 @@ class vulkanImageCUDA {
    copyBufferToImage(stagingBuffer, textureImage,
                      static_cast<uint32_t>(imageWidth),
                      static_cast<uint32_t>(imageHeight));
-    transitionImageLayout(textureImage, VK_FORMAT_R8G8B8A8_UINT,
-                          VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
-                          VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);

    vkDestroyBuffer(device, stagingBuffer, nullptr);
    vkFreeMemory(device, stagingBufferMemory, nullptr);
@ -1523,8 +1524,13 @@ class vulkanImageCUDA {
    vkExternalMemImageCreateInfo.sType =
        VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO;
    vkExternalMemImageCreateInfo.pNext = NULL;
+#ifdef _WIN64
+    vkExternalMemImageCreateInfo.handleTypes =
+        VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
+#else
    vkExternalMemImageCreateInfo.handleTypes =
        VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+#endif

    imageInfo.pNext = &vkExternalMemImageCreateInfo;

@ -2147,7 +2153,7 @@ class vulkanImageCUDA {
      if (vkCreateSemaphore(device, &semaphoreInfo, nullptr,
                            &imageAvailableSemaphores[i]) != VK_SUCCESS ||
          vkCreateSemaphore(device, &semaphoreInfo, nullptr,
-                            &renderFinishedSemaphores[i]) != VK_SUCCESS || 
+                            &renderFinishedSemaphores[i]) != VK_SUCCESS ||
          vkCreateFence(device, &fenceInfo, nullptr, &inFlightFences[i]) !=
              VK_SUCCESS) {
        throw std::runtime_error(
@ -2201,7 +2207,6 @@ class vulkanImageCUDA {
      throw std::runtime_error(
          "failed to create synchronization objects for a CUDA-Vulkan!");
    }
-
  }

  void updateUniformBuffer() {
@ -2333,8 +2338,8 @@ class vulkanImageCUDA {
    submitInfo.signalSemaphoreCount = 2;
    submitInfo.pSignalSemaphores = signalSemaphores;

-    if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, inFlightFences[currentFrame]) !=
-        VK_SUCCESS) {
+    if (vkQueueSubmit(graphicsQueue, 1, &submitInfo,
+                      inFlightFences[currentFrame]) != VK_SUCCESS) {
      throw std::runtime_error("failed to submit draw command buffer!");
    }
  }
@ -2360,8 +2365,8 @@ class vulkanImageCUDA {
    submitInfo.signalSemaphoreCount = 2;
    submitInfo.pSignalSemaphores = signalSemaphores;

-    if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, inFlightFences[currentFrame]) !=
-        VK_SUCCESS) {
+    if (vkQueueSubmit(graphicsQueue, 1, &submitInfo,
+                      inFlightFences[currentFrame]) != VK_SUCCESS) {
      throw std::runtime_error("failed to submit draw command buffer!");
    }
  }