update vulkan samples with SPIR-V shaders

2026-02-05 13:45:41 +08:00 · 2021-06-02 17:17:21 +05:30 · 2021-06-02 17:17:21 +05:30 · 7a5b3e6c8c
commit 7a5b3e6c8c
parent 5c3ec60fae
17 changed files with 2368 additions and 2116 deletions
--- a/Samples/simpleVulkan/Build_instructions.txt
+++ b/Samples/simpleVulkan/Build_instructions.txt
@ -19,8 +19,17 @@ For Linux:
 -- Install "libxcb1-dev" and "xorg-dev" as GLFW3 depends on them
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
 For Linux aarch64(L4T):
 -- Install the GLFW3 library using "sudo apt-get install libglfw3-dev"; this will provide glfw3
 -- The install above will also provide libvulkan-dev as a dependency
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
 -- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr"
 For Shader changes:
 -- Update the sinewave.vert and/or sinewave.frag shader source file as required
 -- Use the glslc shader compiler from the installed Vulkan SDK's bin directory to compile shaders as:
    glslc sinewave.vert -o vert.spv
    glslc sinewave.frag -o frag.spv
 ** Make sure to add glslc's path in your PATH environment variable **
--- a/Samples/simpleVulkan/frag.spv
+++ b/Samples/simpleVulkan/frag.spv
--- a/Samples/simpleVulkan/main.cpp
+++ b/Samples/simpleVulkan/main.cpp
@ -92,9 +92,9 @@ class VulkanCudaSineWave : public VulkanBaseApp {
    }
    // Add our compiled vulkan shader files
    char *vertex_shader_path =
-        sdkFindFilePath("sinewave.vert", execution_path.c_str());
+        sdkFindFilePath("vert.spv", execution_path.c_str());
    char *fragment_shader_path =
-        sdkFindFilePath("sinewave.frag", execution_path.c_str());
+        sdkFindFilePath("frag.spv", execution_path.c_str());
    m_shaderFiles.push_back(
        std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path));
    m_shaderFiles.push_back(
--- a/Samples/simpleVulkan/vert.spv
+++ b/Samples/simpleVulkan/vert.spv
--- a/Samples/simpleVulkanMMAP/Build_instructions.txt
+++ b/Samples/simpleVulkanMMAP/Build_instructions.txt
@ -0,0 +1,35 @@
 For Windows:
 Follow these steps once you have installed Vulkan SDK for Windows from https://www.lunarg.com/vulkan-sdk/
 -- Install GLFW3 library at suitable location
 -- Open the simpleVulkanMMAP VS project file.
 To add the GLFW3 library path
 -- Right click on Project name "simpleVulkanMMAP" and click on "Properties"
 -- In Property pages window go to Linker -> General. Here in "Additional Library Directories" edit and add path to glfw3dll.lib
 To add the GLFW3 headers path
 -- Right click on Project name "simpleVulkanMMAP" and click on "Properties"
 -- In Property pages window go to "VC++ Directories" section. Here in "Include Directories" edit and add path to GLFW3 headers include directory location.
 ** Make sure to add path to glfw3.dll in your PATH environment variable**
 For Linux:
 -- Install the Vulkan SDK from https://www.lunarg.com/vulkan-sdk/  and follow environment setup instructions.
 -- Install GLFW3 library through your OS package repository. For example: apt-get for Ubuntu and dnf for RHEL/CentOS. Below is for Ubuntu:
    sudo apt-get install libglfw3
    sudo apt-get install libglfw3-dev
 -- Install "libxcb1-dev" and "xorg-dev" as GLFW3 depends on them
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
 For Linux aarch64(L4T):
 -- Install the GLFW3 library using "sudo apt-get install libglfw3-dev"; this will provide glfw3
 -- The install above will also provide libvulkan-dev as a dependency
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
 -- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr"
 For Shader changes:
 -- Update the montecarlo.vert and/or montecarlo.frag shader source file as required
 -- Use the glslc shader compiler from the installed Vulkan SDK's bin directory to compile shaders as:
    glslc montecarlo.vert -o vert.spv
    glslc montecarlo.frag -o frag.spv
 ** Make sure to add glslc's path in your PATH environment variable **
--- a/Samples/simpleVulkanMMAP/MonteCarloPi.cu
+++ b/Samples/simpleVulkanMMAP/MonteCarloPi.cu
@ -25,9 +25,9 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
- /*
+/*
-  * See: https://www.piday.org/million/
+ * See: https://www.piday.org/million/
-  */
+ */
 #include "MonteCarloPi.h"
 #include <algorithm>
@ -37,15 +37,16 @@
 #define ROUND_UP_TO_GRANULARITY(x, n) (((x + n - 1) / n) * n)
-  // `ipcHandleTypeFlag` specifies the platform specific handle type this sample
+// `ipcHandleTypeFlag` specifies the platform specific handle type this sample
-  // uses for importing and exporting memory allocation. On Linux this sample
+// uses for importing and exporting memory allocation. On Linux this sample
-  // specifies the type as CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR meaning that
+// specifies the type as CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR meaning that
-  // file descriptors will be used. On Windows this sample specifies the type as
+// file descriptors will be used. On Windows this sample specifies the type as
-  // CU_MEM_HANDLE_TYPE_WIN32 meaning that NT HANDLEs will be used. The
+// CU_MEM_HANDLE_TYPE_WIN32 meaning that NT HANDLEs will be used. The
-  // ipcHandleTypeFlag variable is a convenience variable and is passed by value
+// ipcHandleTypeFlag variable is a convenience variable and is passed by value
-  // to individual requests.
+// to individual requests.
 #if defined(__linux__)
-CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+CUmemAllocationHandleType ipcHandleTypeFlag =
    CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
 #else
 CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_WIN32;
 #endif
@ -53,223 +54,248 @@ CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_WIN32;
 // Windows-specific LPSECURITYATTRIBUTES
 void getDefaultSecurityDescriptor(CUmemAllocationProp *prop) {
 #if defined(__linux__)
-    return;
+  return;
 #elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-    static const char sddl[] = "D:P(OA;;GARCSDWDWOCCDCLCSWLODTWPRPCRFA;;;WD)";
+  static const char sddl[] = "D:P(OA;;GARCSDWDWOCCDCLCSWLODTWPRPCRFA;;;WD)";
-    static OBJECT_ATTRIBUTES objAttributes;
+  static OBJECT_ATTRIBUTES objAttributes;
-    static bool objAttributesConfigured = false;
+  static bool objAttributesConfigured = false;
-    if (!objAttributesConfigured) {
+  if (!objAttributesConfigured) {
-        PSECURITY_DESCRIPTOR secDesc;
+    PSECURITY_DESCRIPTOR secDesc;
-        BOOL result = ConvertStringSecurityDescriptorToSecurityDescriptorA(
+    BOOL result = ConvertStringSecurityDescriptorToSecurityDescriptorA(
-            sddl, SDDL_REVISION_1, &secDesc, NULL);
+        sddl, SDDL_REVISION_1, &secDesc, NULL);
-        if (result == 0) {
+    if (result == 0) {
-            printf("IPC failure: getDefaultSecurityDescriptor Failed! (%d)\n",
+      printf("IPC failure: getDefaultSecurityDescriptor Failed! (%d)\n",
-                GetLastError());
+             GetLastError());
        }
        InitializeObjectAttributes(&objAttributes, NULL, 0, NULL, secDesc);
        objAttributesConfigured = true;
    }
-    prop->win32HandleMetaData = &objAttributes;
+    InitializeObjectAttributes(&objAttributes, NULL, 0, NULL, secDesc);
-    return;
+
    objAttributesConfigured = true;
  }
  prop->win32HandleMetaData = &objAttributes;
  return;
 #endif
 }
-__global__ void monte_carlo_kernel(vec2 *xyVector, float *pointsInsideCircle, float *numPointsInCircle, unsigned int numPoints, float time)
+__global__ void monte_carlo_kernel(vec2 *xyVector, float *pointsInsideCircle,
-{
+                                   float *numPointsInCircle,
-    const size_t stride = gridDim.x * blockDim.x;
+                                   unsigned int numPoints, float time) {
-    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+  const size_t stride = gridDim.x * blockDim.x;
-    float count = 0.0f;
+  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  float count = 0.0f;
-    curandState rgnState;
+  curandState rgnState;
-    curand_init((unsigned long long)time, tid, 0, &rgnState);
+  curand_init((unsigned long long)time, tid, 0, &rgnState);
-    for (; tid < numPoints; tid += stride) {
+  for (; tid < numPoints; tid += stride) {
-        float x = curand_uniform(&rgnState);
+    float x = curand_uniform(&rgnState);
-        float y = curand_uniform(&rgnState);
+    float y = curand_uniform(&rgnState);
-        x = (2.0f * x) - 1.0f;
+    x = (2.0f * x) - 1.0f;
-        y = (2.0f * y) - 1.0f;
+    y = (2.0f * y) - 1.0f;
-        xyVector[tid][0] = x;
+    xyVector[tid][0] = x;
-        xyVector[tid][1] = y;
+    xyVector[tid][1] = y;
-        // Compute the distance of this point form the center(0, 0)
+    // Compute the distance of this point from the center(0, 0)
-        float dist = sqrtf((x*x) + (y*y));
+    float dist = sqrtf((x * x) + (y * y));
-        // If distance is less than the radius of the unit circle, the point lies in the circle.
+    // If distance is less than the radius of the unit circle, the point lies in
-        pointsInsideCircle[tid] = (dist <= 1.0f);
+    // the circle.
-        count += (dist <= 1.0f);
+    pointsInsideCircle[tid] = (dist <= 1.0f);
-    }
+    count += (dist <= 1.0f);
-    atomicAdd(numPointsInCircle, count);
+  }
  atomicAdd(numPointsInCircle, count);
 }
-MonteCarloPiSimulation::MonteCarloPiSimulation(size_t num_points) :
+MonteCarloPiSimulation::MonteCarloPiSimulation(size_t num_points)
-    m_xyVector(nullptr),
+    : m_xyVector(nullptr),
-    m_pointsInsideCircle(nullptr),
+      m_pointsInsideCircle(nullptr),
-    m_totalPointsInsideCircle(0),
+      m_totalPointsInsideCircle(0),
-    m_totalPointsSimulated(0),
+      m_totalPointsSimulated(0),
-    m_numPoints(num_points)
+      m_numPoints(num_points) {}
-{
+
 MonteCarloPiSimulation::~MonteCarloPiSimulation() {
  if (m_numPointsInCircle) {
    checkCudaErrors(cudaFree(m_numPointsInCircle));
    m_numPointsInCircle = nullptr;
  }
  if (m_hostNumPointsInCircle) {
    checkCudaErrors(cudaFreeHost(m_hostNumPointsInCircle));
    m_hostNumPointsInCircle = nullptr;
  }
  cleanupSimulationAllocations();
 }
-MonteCarloPiSimulation::~MonteCarloPiSimulation()
+void MonteCarloPiSimulation::initSimulation(int cudaDevice,
-{
+                                            cudaStream_t stream) {
-    if (m_numPointsInCircle) {
+  m_cudaDevice = cudaDevice;
-        checkCudaErrors(cudaFree(m_numPointsInCircle));
+  getIdealExecutionConfiguration();
        m_numPointsInCircle = nullptr;
    }
    if (m_hostNumPointsInCircle) {
        checkCudaErrors(cudaFreeHost(m_hostNumPointsInCircle));
        m_hostNumPointsInCircle = nullptr;
    }
-    cleanupSimulationAllocations();
+  // Allocate a position buffer that contains random location of the points in
  // XY cartesian plane.
  // Allocate a bitmap buffer which holds information of whether a point in the
  // position buffer is inside the unit circle or not.
  setupSimulationAllocations();
  checkCudaErrors(
      cudaMalloc((float **)&m_numPointsInCircle, sizeof(*m_numPointsInCircle)));
  checkCudaErrors(cudaMallocHost((float **)&m_hostNumPointsInCircle,
                                 sizeof(*m_hostNumPointsInCircle)));
 }
-void MonteCarloPiSimulation::initSimulation(int cudaDevice, cudaStream_t stream)
+void MonteCarloPiSimulation::stepSimulation(float time, cudaStream_t stream) {
-{
+  checkCudaErrors(cudaMemsetAsync(m_numPointsInCircle, 0,
-    m_cudaDevice = cudaDevice;
+                                  sizeof(*m_numPointsInCircle), stream));
    getIdealExecutionConfiguration();
-    // Allocate a position buffer that contains random location of the points in XY cartesian plane.
+  monte_carlo_kernel<<<m_blocks, m_threads, 0, stream>>>(
-    // Allocate a bitmap buffer which holds information of whether a point in the position buffer is inside the unit circle or not.
+      m_xyVector, m_pointsInsideCircle, m_numPointsInCircle, m_numPoints, time);
-    setupSimulationAllocations();
+  getLastCudaError("Failed to launch CUDA simulation");
-    checkCudaErrors(cudaMalloc((float **)&m_numPointsInCircle, sizeof(*m_numPointsInCircle)));
+  checkCudaErrors(cudaMemcpyAsync(m_hostNumPointsInCircle, m_numPointsInCircle,
-    checkCudaErrors(cudaMallocHost((float **)&m_hostNumPointsInCircle, sizeof(*m_hostNumPointsInCircle)));
+                                  sizeof(*m_numPointsInCircle),
                                  cudaMemcpyDeviceToHost, stream));
  // Queue up a stream callback to compute and print the PI value.
  checkCudaErrors(
      cudaLaunchHostFunc(stream, this->computePiCallback, (void *)this));
 }
-void MonteCarloPiSimulation::stepSimulation(float time, cudaStream_t stream)
+void MonteCarloPiSimulation::computePiCallback(void *args) {
-{
+  MonteCarloPiSimulation *cbData = (MonteCarloPiSimulation *)args;
-
+  cbData->m_totalPointsInsideCircle += *(cbData->m_hostNumPointsInCircle);
-    checkCudaErrors(cudaMemsetAsync(m_numPointsInCircle, 0, sizeof(*m_numPointsInCircle), stream));
+  cbData->m_totalPointsSimulated += cbData->m_numPoints;
-
+  double piValue = 4.0 * ((double)cbData->m_totalPointsInsideCircle /
-    monte_carlo_kernel << < m_blocks, m_threads, 0, stream >> > (m_xyVector, m_pointsInsideCircle, m_numPointsInCircle, m_numPoints, time);
+                          (double)cbData->m_totalPointsSimulated);
-    getLastCudaError("Failed to launch CUDA simulation");
+  printf("Approximate Pi value for %zd data points: %lf \n",
-
+         cbData->m_totalPointsSimulated, piValue);
    checkCudaErrors(cudaMemcpyAsync(m_hostNumPointsInCircle, m_numPointsInCircle, sizeof(*m_numPointsInCircle), cudaMemcpyDeviceToHost, stream));
    // Queue up a stream callback to compute and print the PI value.
    checkCudaErrors(cudaLaunchHostFunc(stream, this->computePiCallback, (void *)this));
 }
-void MonteCarloPiSimulation::computePiCallback(void *args)
+void MonteCarloPiSimulation::getIdealExecutionConfiguration() {
-{
+  int warpSize = 0;
-    MonteCarloPiSimulation *cbData = (MonteCarloPiSimulation *)args;
+  int multiProcessorCount = 0;
-    cbData->m_totalPointsInsideCircle += *(cbData->m_hostNumPointsInCircle);
+
-    cbData->m_totalPointsSimulated += cbData->m_numPoints;
+  checkCudaErrors(cudaSetDevice(m_cudaDevice));
-    double piValue = 4.0 * ((double)cbData->m_totalPointsInsideCircle / (double)cbData->m_totalPointsSimulated);
+  checkCudaErrors(
-    printf("Approximate Pi value for %zd data points: %lf \n", cbData->m_totalPointsSimulated, piValue);
+      cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, m_cudaDevice));
  // We don't need large block sizes, since there's not much inter-thread
  // communication
  m_threads = warpSize;
  // Use the occupancy calculator and fill the gpu as best as we can
  checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &m_blocks, monte_carlo_kernel, warpSize, 0));
  checkCudaErrors(cudaDeviceGetAttribute(
      &multiProcessorCount, cudaDevAttrMultiProcessorCount, m_cudaDevice));
  m_blocks *= multiProcessorCount;
  // Go ahead and clamp the blocks to the minimum needed for this
  // height/width
  m_blocks =
      std::min(m_blocks, (int)((m_numPoints + m_threads - 1) / m_threads));
 }
-void MonteCarloPiSimulation::getIdealExecutionConfiguration()
+void MonteCarloPiSimulation::setupSimulationAllocations() {
-{
+  CUdeviceptr d_ptr = 0U;
-    int warpSize = 0;
+  size_t granularity = 0;
-    int multiProcessorCount = 0;
+  CUmemGenericAllocationHandle cudaPositionHandle, cudaInCircleHandle;
-    checkCudaErrors(cudaSetDevice(m_cudaDevice));
+  CUmemAllocationProp allocProp = {};
-    checkCudaErrors(cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, m_cudaDevice));
+  allocProp.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  allocProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  allocProp.location.id = m_cudaDevice;
  allocProp.win32HandleMetaData = NULL;
  allocProp.requestedHandleTypes = ipcHandleTypeFlag;
-    // We don't need large block sizes, since there's not much inter-thread communication
+  // Windows-specific LPSECURITYATTRIBUTES is required when
-    m_threads = warpSize;
+  // CU_MEM_HANDLE_TYPE_WIN32 is used. The security attribute defines the scope
  // of which exported allocations may be transferred to other processes. For all
  // other handle types, pass NULL.
  getDefaultSecurityDescriptor(&allocProp);
-    // Use the occupancy calculator and fill the gpu as best as we can
+  // Get the recommended granularity for m_cudaDevice.
-    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_blocks, monte_carlo_kernel, warpSize, 0));
+  checkCudaErrors(cuMemGetAllocationGranularity(
      &granularity, &allocProp, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
-    checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, m_cudaDevice));
+  size_t xyPositionVecSize = m_numPoints * sizeof(*m_xyVector);
-    m_blocks *= multiProcessorCount;
+  size_t inCircleVecSize = m_numPoints * sizeof(*m_pointsInsideCircle);
-    // Go ahead and the clamp the blocks to the minimum needed for this height/width
+  size_t xyPositionSize =
-    m_blocks = std::min(m_blocks, (int)((m_numPoints + m_threads - 1) / m_threads));
+      ROUND_UP_TO_GRANULARITY(xyPositionVecSize, granularity);
  size_t inCircleSize = ROUND_UP_TO_GRANULARITY(inCircleVecSize, granularity);
  m_totalAllocationSize = (xyPositionSize + inCircleSize);
  // Reserve the required contiguous VA space for the allocations
  checkCudaErrors(
      cuMemAddressReserve(&d_ptr, m_totalAllocationSize, granularity, 0U, 0));
  // Create the allocations as a pinned allocation on this device.
  // Create an allocation to store all the positions of points on the xy plane
  // and a second allocation which stores information if the corresponding
  // position is inside the unit circle or not.
  checkCudaErrors(
      cuMemCreate(&cudaPositionHandle, xyPositionSize, &allocProp, 0));
  checkCudaErrors(
      cuMemCreate(&cudaInCircleHandle, inCircleSize, &allocProp, 0));
  // Export the allocation to a platform-specific handle. The type of handle
  // requested here must match the requestedHandleTypes field in the prop
  // structure passed to cuMemCreate. The handle obtained here will be passed to
  // vulkan to import the allocation.
  checkCudaErrors(cuMemExportToShareableHandle(
      (void *)&m_posShareableHandle, cudaPositionHandle, ipcHandleTypeFlag, 0));
  checkCudaErrors(
      cuMemExportToShareableHandle((void *)&m_inCircleShareableHandle,
                                   cudaInCircleHandle, ipcHandleTypeFlag, 0));
  CUdeviceptr va_position = d_ptr;
  CUdeviceptr va_InCircle = va_position + xyPositionSize;
  m_pointsInsideCircle = (float *)va_InCircle;
  m_xyVector = (vec2 *)va_position;
  // Assign the chunk to the appropriate VA range
  checkCudaErrors(
      cuMemMap(va_position, xyPositionSize, 0, cudaPositionHandle, 0));
  checkCudaErrors(
      cuMemMap(va_InCircle, inCircleSize, 0, cudaInCircleHandle, 0));
  // Release the handles for the allocation. Since the allocation is currently
  // mapped to a VA range with a previous call to cuMemMap the actual freeing of
  // memory allocation will happen on an eventual call to cuMemUnmap. Thus the
  // allocation will be kept live until it is unmapped.
  checkCudaErrors(cuMemRelease(cudaPositionHandle));
  checkCudaErrors(cuMemRelease(cudaInCircleHandle));
  CUmemAccessDesc accessDescriptor = {};
  accessDescriptor.location.id = m_cudaDevice;
  accessDescriptor.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  // Apply the access descriptor to the whole VA range. Essentially enables
  // Read-Write access to the range.
  checkCudaErrors(
      cuMemSetAccess(d_ptr, m_totalAllocationSize, &accessDescriptor, 1));
 }
-void MonteCarloPiSimulation::setupSimulationAllocations()
+void MonteCarloPiSimulation::cleanupSimulationAllocations() {
-{
+  if (m_xyVector && m_pointsInsideCircle) {
-    CUdeviceptr d_ptr = 0U;
+    // Unmap the mapped virtual memory region
-    size_t granularity = 0;
+    // Since the handles to the mapped backing stores have already been released
-    CUmemGenericAllocationHandle cudaPositionHandle, cudaInCircleHandle;
+    // by cuMemRelease, and these are the only/last mappings referencing them,
    // The backing stores will be freed.
    checkCudaErrors(cuMemUnmap((CUdeviceptr)m_xyVector, m_totalAllocationSize));
-    CUmemAllocationProp allocProp = { };
+    checkIpcErrors(ipcCloseShareableHandle(m_posShareableHandle));
-    allocProp.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+    checkIpcErrors(ipcCloseShareableHandle(m_inCircleShareableHandle));
    allocProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    allocProp.location.id = m_cudaDevice;
    allocProp.win32HandleMetaData = NULL;
    allocProp.requestedHandleTypes = ipcHandleTypeFlag;
-    // Windows-specific LPSECURITYATTRIBUTES is required when
+    // Free the virtual address region.
-    // CU_MEM_HANDLE_TYPE_WIN32 is used. The security attribute defines the scope
+    checkCudaErrors(
-    // of which exported allocations may be tranferred to other processes. For all
+        cuMemAddressFree((CUdeviceptr)m_xyVector, m_totalAllocationSize));
    // other handle types, pass NULL.
    getDefaultSecurityDescriptor(&allocProp);
-    // Get the recommended granularity for m_cudaDevice.
+    m_xyVector = nullptr;
-    checkCudaErrors(cuMemGetAllocationGranularity(&granularity, &allocProp, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
+    m_pointsInsideCircle = nullptr;
-
+  }
    size_t xyPositionVecSize = m_numPoints * sizeof(*m_xyVector);
    size_t inCircleVecSize = m_numPoints * sizeof(*m_pointsInsideCircle);
    size_t xyPositionSize = ROUND_UP_TO_GRANULARITY(xyPositionVecSize, granularity);
    size_t inCircleSize = ROUND_UP_TO_GRANULARITY(inCircleVecSize, granularity);
    m_totalAllocationSize = (xyPositionSize + inCircleSize);
    // Reserve the required contiguous VA space for the allocations
    checkCudaErrors(cuMemAddressReserve(&d_ptr, m_totalAllocationSize, granularity, 0U, 0));
    // Create the allocations as a pinned allocation on this device.
    // Create an allocation to store all the positions of points on the xy plane and a second
    // allocation which stores information if the corresponding position is inside the unit circle or not.
    checkCudaErrors(cuMemCreate(&cudaPositionHandle, xyPositionSize, &allocProp, 0));
    checkCudaErrors(cuMemCreate(&cudaInCircleHandle, inCircleSize, &allocProp, 0));
    // Export the allocation to a platform-specific handle. The type of handle
    // requested here must match the requestedHandleTypes field in the prop
    // structure passed to cuMemCreate. The handle obtained here will be passed to vulkan
    // to import the allocation.
    checkCudaErrors(cuMemExportToShareableHandle((void *)&m_posShareableHandle, cudaPositionHandle, ipcHandleTypeFlag, 0));
    checkCudaErrors(cuMemExportToShareableHandle((void *)&m_inCircleShareableHandle, cudaInCircleHandle, ipcHandleTypeFlag, 0));
    CUdeviceptr va_position = d_ptr;
    CUdeviceptr va_InCircle = va_position + xyPositionSize;
    m_pointsInsideCircle = (float *)va_InCircle;
    m_xyVector = (vec2 *)va_position;
    // Assign the chunk to the appropriate VA range
    checkCudaErrors(cuMemMap(va_position, xyPositionSize, 0, cudaPositionHandle, 0));
    checkCudaErrors(cuMemMap(va_InCircle, inCircleSize, 0, cudaInCircleHandle, 0));
    // Release the handles for the allocation. Since the allocation is currently mapped to a VA range
    // with a previous call to cuMemMap the actual freeing of memory allocation will happen on an eventual call to
    // cuMemUnmap. Thus the allocation will be kept live until it is unmapped.
    checkCudaErrors(cuMemRelease(cudaPositionHandle));
    checkCudaErrors(cuMemRelease(cudaInCircleHandle));
    CUmemAccessDesc accessDescriptor = {};
    accessDescriptor.location.id = m_cudaDevice;
    accessDescriptor.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    // Apply the access descriptor to the whole VA range. Essentially enables Read-Write access to the range.
    checkCudaErrors(cuMemSetAccess(d_ptr, m_totalAllocationSize, &accessDescriptor, 1));
 }
 void MonteCarloPiSimulation::cleanupSimulationAllocations()
 {
    if (m_xyVector && m_pointsInsideCircle) {
        // Unmap the mapped virtual memory region
        // Since the handles to the mapped backing stores have already been released
        // by cuMemRelease, and these are the only/last mappings referencing them,
        // The backing stores will be freed.
        checkCudaErrors(cuMemUnmap((CUdeviceptr)m_xyVector, m_totalAllocationSize));
        checkIpcErrors(ipcCloseShareableHandle(m_posShareableHandle));
        checkIpcErrors(ipcCloseShareableHandle(m_inCircleShareableHandle));
        // Free the virtual address region.
        checkCudaErrors(cuMemAddressFree((CUdeviceptr)m_xyVector, m_totalAllocationSize));
        m_xyVector = nullptr;
        m_pointsInsideCircle = nullptr;
    }
 }
--- a/Samples/simpleVulkanMMAP/MonteCarloPi.h
+++ b/Samples/simpleVulkanMMAP/MonteCarloPi.h
@ -39,62 +39,57 @@
 typedef float vec2[2];
-class MonteCarloPiSimulation
+class MonteCarloPiSimulation {
-{
+  size_t m_numPoints;
    size_t m_numPoints;
-    // Pointers to Cuda allocated buffers which are imported and used by vulkan as vertex buffer
+  // Pointers to Cuda allocated buffers which are imported and used by vulkan as
-    vec2 *m_xyVector;
+  // vertex buffer
-    float *m_pointsInsideCircle;
+  vec2 *m_xyVector;
  float *m_pointsInsideCircle;
-    // Pointers to device and host allocated memories storing number of points that are inside the unit circle
+  // Pointers to device and host allocated memories storing number of points
-    float *m_numPointsInCircle;
+  // that are inside the unit circle
-    float *m_hostNumPointsInCircle;
+  float *m_numPointsInCircle;
  float *m_hostNumPointsInCircle;
-    int m_blocks, m_threads;
+  int m_blocks, m_threads;
-    // Total size of allocations created by cuMemMap Apis. This size is the sum of sizes of
+  // Total size of allocations created by cuMemMap Apis. This size is the sum of
-    // m_xyVector and m_pointsInsideCircle buffers.
+  // sizes of m_xyVector and m_pointsInsideCircle buffers.
-    size_t m_totalAllocationSize;
+  size_t m_totalAllocationSize;
-    // Shareable Handles(a file descriptor on Linux and NT Handle on Windows), used for sharing cuda
+  // Shareable Handles(a file descriptor on Linux and NT Handle on Windows),
-    // allocated memory with Vulkan
+  // used for sharing cuda
-    ShareableHandle m_posShareableHandle, m_inCircleShareableHandle;
+  // allocated memory with Vulkan
  ShareableHandle m_posShareableHandle, m_inCircleShareableHandle;
-    // Cuda Device corresponding to the Vulkan Physical device
+  // Cuda Device corresponding to the Vulkan Physical device
-    int m_cudaDevice;
+  int m_cudaDevice;
-    // Track and accumulate total points that have been simulated since start of the sample.
+  // Track and accumulate total points that have been simulated since start of
-    // The idea is to get a closer approximation to PI with time.
+  // the sample. The idea is to get a closer approximation to PI with time.
-    size_t m_totalPointsInsideCircle;
+  size_t m_totalPointsInsideCircle;
-    size_t m_totalPointsSimulated;
+  size_t m_totalPointsSimulated;
-    void setupSimulationAllocations();
+  void setupSimulationAllocations();
-    void cleanupSimulationAllocations();
+  void cleanupSimulationAllocations();
-    void getIdealExecutionConfiguration();
+  void getIdealExecutionConfiguration();
-public:
+ public:
-    MonteCarloPiSimulation(size_t num_points);
+  MonteCarloPiSimulation(size_t num_points);
-    ~MonteCarloPiSimulation();
+  ~MonteCarloPiSimulation();
-    void initSimulation(int cudaDevice, cudaStream_t stream = 0);
+  void initSimulation(int cudaDevice, cudaStream_t stream = 0);
-    void stepSimulation(float time, cudaStream_t stream = 0);
+  void stepSimulation(float time, cudaStream_t stream = 0);
-    static void computePiCallback(void *args);
+  static void computePiCallback(void *args);
-    size_t getNumPoints() const {
+  size_t getNumPoints() const { return m_numPoints; }
        return m_numPoints;
    }
-    float getNumPointsInCircle() const {
+  float getNumPointsInCircle() const { return *m_hostNumPointsInCircle; }
        return *m_hostNumPointsInCircle;
    }
    ShareableHandle &getPositionShareableHandle() {
        return m_posShareableHandle;
    }
    ShareableHandle &getInCircleShareableHandle() {
        return m_inCircleShareableHandle;
    }
  ShareableHandle &getPositionShareableHandle() { return m_posShareableHandle; }
  ShareableHandle &getInCircleShareableHandle() {
    return m_inCircleShareableHandle;
  }
 };
-#endif // __PISIM_H__
+#endif  // __PISIM_H__
--- a/Samples/simpleVulkanMMAP/VulkanBaseApp.cpp
+++ b/Samples/simpleVulkanMMAP/VulkanBaseApp.cpp
--- a/Samples/simpleVulkanMMAP/VulkanBaseApp.h
+++ b/Samples/simpleVulkanMMAP/VulkanBaseApp.h
@ -40,101 +40,119 @@
 struct GLFWwindow;
-class VulkanBaseApp
+class VulkanBaseApp {
-{
+ public:
-public:
+  VulkanBaseApp(const std::string& appName, bool enableValidation = false);
-    VulkanBaseApp(const std::string& appName, bool enableValidation = false);
+  static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType();
-    static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType();
+  static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType();
-    static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType();
+  virtual ~VulkanBaseApp();
-    virtual ~VulkanBaseApp();
+  void init();
-    void init();
+  void* getMemHandle(VkDeviceMemory memory,
-    void *getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType);
+                     VkExternalMemoryHandleTypeFlagBits handleType);
-    void *getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType);
+  void* getSemaphoreHandle(VkSemaphore semaphore,
-    bool isVkPhysicalDeviceUuid(void *Uuid);
+                           VkExternalSemaphoreHandleTypeFlagBits handleType);
-    void createExternalSemaphore(VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType);
+  bool isVkPhysicalDeviceUuid(void* Uuid);
-    void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& bufferMemory);
+  void createExternalSemaphore(
-    void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer& buffer, VkDeviceMemory& bufferMemory);
+      VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType);
-    void importExternalBuffer(void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& memory);
+  void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage,
-    void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size);
+                    VkMemoryPropertyFlags properties, VkBuffer& buffer,
-    VkCommandBuffer beginSingleTimeCommands();
+                    VkDeviceMemory& bufferMemory);
-    void endSingleTimeCommands(VkCommandBuffer commandBuffer);
+  void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage,
-    void mainLoop();
+                            VkMemoryPropertyFlags properties,
-protected:
+                            VkExternalMemoryHandleTypeFlagsKHR extMemHandleType,
-    const std::string m_appName;
+                            VkBuffer& buffer, VkDeviceMemory& bufferMemory);
-    const bool m_enableValidation;
+  void importExternalBuffer(void* handle,
-    VkInstance m_instance;
+                            VkExternalMemoryHandleTypeFlagBits handleType,
-    VkDebugUtilsMessengerEXT m_debugMessenger;
+                            size_t size, VkBufferUsageFlags usage,
-    VkSurfaceKHR m_surface;
+                            VkMemoryPropertyFlags properties, VkBuffer& buffer,
-    VkPhysicalDevice m_physicalDevice;
+                            VkDeviceMemory& memory);
-    uint8_t m_deviceUUID[VK_UUID_SIZE];
+  void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size);
-    VkDevice m_device;
+  VkCommandBuffer beginSingleTimeCommands();
-    VkQueue m_graphicsQueue;
+  void endSingleTimeCommands(VkCommandBuffer commandBuffer);
-    VkQueue m_presentQueue;
+  void mainLoop();
    VkSwapchainKHR m_swapChain;
    std::vector<VkImage> m_swapChainImages;
    VkFormat m_swapChainFormat;
    VkExtent2D m_swapChainExtent;
    std::vector<VkImageView> m_swapChainImageViews;
    std::vector<std::pair<VkShaderStageFlagBits, std::string> > m_shaderFiles;
    VkRenderPass m_renderPass;
    VkPipelineLayout m_pipelineLayout;
    VkPipeline m_graphicsPipeline;
    std::vector<VkFramebuffer> m_swapChainFramebuffers;
    VkCommandPool m_commandPool;
    std::vector<VkCommandBuffer> m_commandBuffers;
    std::vector<VkSemaphore> m_imageAvailableSemaphores;
    std::vector<VkSemaphore> m_renderFinishedSemaphores;
    std::vector<VkFence> m_inFlightFences;
    std::vector<VkBuffer> m_uniformBuffers;
    std::vector<VkDeviceMemory> m_uniformMemory;
    VkDescriptorSetLayout m_descriptorSetLayout;
    VkDescriptorPool m_descriptorPool;
    std::vector<VkDescriptorSet> m_descriptorSets;
-    VkImage m_depthImage;
+ protected:
-    VkDeviceMemory m_depthImageMemory;
+  const std::string m_appName;
-    VkImageView m_depthImageView;
+  const bool m_enableValidation;
-    size_t m_currentFrame;
+  VkInstance m_instance;
-    bool m_framebufferResized;
+  VkDebugUtilsMessengerEXT m_debugMessenger;
  VkSurfaceKHR m_surface;
  VkPhysicalDevice m_physicalDevice;
  uint8_t m_deviceUUID[VK_UUID_SIZE];
  VkDevice m_device;
  VkQueue m_graphicsQueue;
  VkQueue m_presentQueue;
  VkSwapchainKHR m_swapChain;
  std::vector<VkImage> m_swapChainImages;
  VkFormat m_swapChainFormat;
  VkExtent2D m_swapChainExtent;
  std::vector<VkImageView> m_swapChainImageViews;
  std::vector<std::pair<VkShaderStageFlagBits, std::string> > m_shaderFiles;
  VkRenderPass m_renderPass;
  VkPipelineLayout m_pipelineLayout;
  VkPipeline m_graphicsPipeline;
  std::vector<VkFramebuffer> m_swapChainFramebuffers;
  VkCommandPool m_commandPool;
  std::vector<VkCommandBuffer> m_commandBuffers;
  std::vector<VkSemaphore> m_imageAvailableSemaphores;
  std::vector<VkSemaphore> m_renderFinishedSemaphores;
  std::vector<VkFence> m_inFlightFences;
  std::vector<VkBuffer> m_uniformBuffers;
  std::vector<VkDeviceMemory> m_uniformMemory;
  VkDescriptorSetLayout m_descriptorSetLayout;
  VkDescriptorPool m_descriptorPool;
  std::vector<VkDescriptorSet> m_descriptorSets;
-    virtual void initVulkanApp() {}
+  VkImage m_depthImage;
-    virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {}
+  VkDeviceMemory m_depthImageMemory;
-    virtual std::vector<const char *> getRequiredExtensions() const;
+  VkImageView m_depthImageView;
-    virtual std::vector<const char *> getRequiredDeviceExtensions() const;
+  size_t m_currentFrame;
-    virtual void getVertexDescriptions(std::vector<VkVertexInputBindingDescription>& bindingDesc, std::vector<VkVertexInputAttributeDescription>& attribDesc);
+  bool m_framebufferResized;
    virtual void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info);
    virtual void getWaitFrameSemaphores(std::vector<VkSemaphore>& wait, std::vector< VkPipelineStageFlags>& waitStages) const;
    virtual void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const;
    virtual VkDeviceSize getUniformSize() const;
    virtual void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame);
    virtual void drawFrame();
 private:
    GLFWwindow *m_window;
-    void initWindow();
+  virtual void initVulkanApp() {}
-    void initVulkan();
+  virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {}
-    void createInstance();
+  virtual std::vector<const char*> getRequiredExtensions() const;
-    void createSurface();
+  virtual std::vector<const char*> getRequiredDeviceExtensions() const;
-    void createDevice();
+  virtual void getVertexDescriptions(
-    void createSwapChain();
+      std::vector<VkVertexInputBindingDescription>& bindingDesc,
-    void createImageViews();
+      std::vector<VkVertexInputAttributeDescription>& attribDesc);
-    void createRenderPass();
+  virtual void getAssemblyStateInfo(
-    void createDescriptorSetLayout();
+      VkPipelineInputAssemblyStateCreateInfo& info);
-    void createGraphicsPipeline();
+  virtual void getWaitFrameSemaphores(
-    void createFramebuffers();
+      std::vector<VkSemaphore>& wait,
-    void createCommandPool();
+      std::vector<VkPipelineStageFlags>& waitStages) const;
-    void createDepthResources();
+  virtual void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const;
-    void createUniformBuffers();
+  virtual VkDeviceSize getUniformSize() const;
-    void createDescriptorPool();
+  virtual void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame);
-    void createDescriptorSets();
+  virtual void drawFrame();
    void createCommandBuffers();
    void createSyncObjects();
-    void cleanupSwapChain();
+ private:
-    void recreateSwapChain();
+  GLFWwindow* m_window;
-    bool isSuitableDevice(VkPhysicalDevice dev) const;
+  void initWindow();
-    static void resizeCallback(GLFWwindow *window, int width, int height);
+  void initVulkan();
  void createInstance();
  void createSurface();
  void createDevice();
  void createSwapChain();
  void createImageViews();
  void createRenderPass();
  void createDescriptorSetLayout();
  void createGraphicsPipeline();
  void createFramebuffers();
  void createCommandPool();
  void createDepthResources();
  void createUniformBuffers();
  void createDescriptorPool();
  void createDescriptorSets();
  void createCommandBuffers();
  void createSyncObjects();
  void cleanupSwapChain();
  void recreateSwapChain();
  bool isSuitableDevice(VkPhysicalDevice dev) const;
  static void resizeCallback(GLFWwindow* window, int width, int height);
 };
 void readFile(std::istream& s, std::vector<char>& data);
--- a/Samples/simpleVulkanMMAP/VulkanCudaInterop.h
+++ b/Samples/simpleVulkanMMAP/VulkanCudaInterop.h
@ -35,41 +35,48 @@
 #include <helper_cuda.h>
 bool isDeviceCompatible(void *Uuid, size_t size) {
  int cudaDevice = cudaInvalidDeviceId;
  int deviceCount;
  checkCudaErrors(cudaGetDeviceCount(&deviceCount));
-    int cudaDevice = cudaInvalidDeviceId;
+  for (int i = 0; i < deviceCount; ++i) {
-    int deviceCount;
+    cudaDeviceProp devProp = {};
-    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
+    checkCudaErrors(cudaGetDeviceProperties(&devProp, i));
-
+    if (!memcmp(&devProp.uuid, Uuid, size)) {
-    for (int i = 0; i < deviceCount; ++i) {
+      cudaDevice = i;
-        cudaDeviceProp devProp = { };
+      break;
        checkCudaErrors(cudaGetDeviceProperties(&devProp, i));
        if (!memcmp(&devProp.uuid, Uuid, size)) {
            cudaDevice = i;
            break;
        }
    }
    if (cudaDevice == cudaInvalidDeviceId) {
        return false;
    }
  }
  if (cudaDevice == cudaInvalidDeviceId) {
    return false;
  }
-    int deviceSupportsHandle = 0;
+  int deviceSupportsHandle = 0;
-    int attributeVal = 0;
+  int attributeVal = 0;
-    int deviceComputeMode = 0;
+  int deviceComputeMode = 0;
-    checkCudaErrors(cuDeviceGetAttribute(&deviceComputeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, cudaDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
-    checkCudaErrors(cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cudaDevice));
+      &deviceComputeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, cudaDevice));
  checkCudaErrors(cuDeviceGetAttribute(
      &attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
      cudaDevice));
 #if defined(__linux__)
-    checkCudaErrors(cuDeviceGetAttribute(&deviceSupportsHandle, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED, cudaDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
      &deviceSupportsHandle,
      CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED,
      cudaDevice));
 #else
-    checkCudaErrors(cuDeviceGetAttribute(&deviceSupportsHandle, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, cudaDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
      &deviceSupportsHandle,
      CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, cudaDevice));
 #endif
-    if ((deviceComputeMode != CU_COMPUTEMODE_DEFAULT) || !attributeVal || !deviceSupportsHandle) {
+  if ((deviceComputeMode != CU_COMPUTEMODE_DEFAULT) || !attributeVal ||
-        return false;
+      !deviceSupportsHandle) {
-    }
+    return false;
-    return true;
+  }
  return true;
 }
-#endif // __VKCUDA_H__
+#endif  // __VKCUDA_H__
--- a/Samples/simpleVulkanMMAP/frag.spv
+++ b/Samples/simpleVulkanMMAP/frag.spv
--- a/Samples/simpleVulkanMMAP/main.cpp
+++ b/Samples/simpleVulkanMMAP/main.cpp
@ -25,11 +25,12 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
- /*
+/*
-  * This sample demonstrates CUDA Interop with Vulkan using cuMemMap APIs.
+ * This sample demonstrates CUDA Interop with Vulkan using cuMemMap APIs.
-  * Allocating device memory and updating values in those allocations are performed by CUDA
+ * Allocating device memory and updating values in those allocations are
-  * and the contents of the allocation are visualized by Vulkan.
+ * performed by CUDA and the contents of the allocation are visualized by
-  */
+ * Vulkan.
 */
 #include "VulkanBaseApp.h"
@ -55,25 +56,23 @@
 std::string execution_path;
-class VulkanCudaPi : public VulkanBaseApp
+class VulkanCudaPi : public VulkanBaseApp {
-{
+  typedef struct UniformBufferObject_st { float frame; } UniformBufferObject;
    typedef struct UniformBufferObject_st {
        float frame;
    } UniformBufferObject;
-    VkBuffer m_inCircleBuffer, m_xyPositionBuffer;
+  VkBuffer m_inCircleBuffer, m_xyPositionBuffer;
-    VkDeviceMemory m_inCircleMemory, m_xyPositionMemory;
+  VkDeviceMemory m_inCircleMemory, m_xyPositionMemory;
-    VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore;
+  VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore;
-    MonteCarloPiSimulation m_sim;
+  MonteCarloPiSimulation m_sim;
-    UniformBufferObject m_ubo;
+  UniformBufferObject m_ubo;
-    cudaStream_t m_stream;
+  cudaStream_t m_stream;
-    cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore;
+  cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore;
-    using chrono_tp = std::chrono::time_point<std::chrono::high_resolution_clock>;
+  using chrono_tp = std::chrono::time_point<std::chrono::high_resolution_clock>;
-    chrono_tp m_lastTime;
+  chrono_tp m_lastTime;
-    size_t m_lastFrame;
+  size_t m_lastFrame;
-public:
+
-    VulkanCudaPi(size_t num_points) :
+ public:
-        VulkanBaseApp("simpleVulkanMMAP", ENABLE_VALIDATION),
+  VulkanCudaPi(size_t num_points)
      : VulkanBaseApp("simpleVulkanMMAP", ENABLE_VALIDATION),
        m_inCircleBuffer(VK_NULL_HANDLE),
        m_xyPositionBuffer(VK_NULL_HANDLE),
        m_inCircleMemory(VK_NULL_HANDLE),
@ -86,232 +85,268 @@ public:
        m_cudaWaitSemaphore(),
        m_cudaSignalSemaphore(),
        m_lastFrame(0) {
    // Add our compiled vulkan shader files
    char* vertex_shader_path =
        sdkFindFilePath("vert.spv", execution_path.c_str());
    char* fragment_shader_path =
        sdkFindFilePath("frag.spv", execution_path.c_str());
    m_shaderFiles.push_back(
        std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path));
    m_shaderFiles.push_back(
        std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path));
  }
-        // Add our compiled vulkan shader files
+  ~VulkanCudaPi() {
-        char* vertex_shader_path = sdkFindFilePath("montecarlo.vert", execution_path.c_str());
+    if (m_stream) {
-        char* fragment_shader_path = sdkFindFilePath("montecarlo.frag", execution_path.c_str());
+      // Make sure there's no pending work before we start tearing down
-        m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path));
+      checkCudaErrors(cudaStreamSynchronize(m_stream));
-        m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path));
+      checkCudaErrors(cudaStreamDestroy(m_stream));
    }
-    ~VulkanCudaPi() {
+    if (m_vkSignalSemaphore != VK_NULL_HANDLE) {
-        if (m_stream) {
+      checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore));
-            // Make sure there's no pending work before we start tearing down
+      vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr);
-            checkCudaErrors(cudaStreamSynchronize(m_stream));
+    }
-            checkCudaErrors(cudaStreamDestroy(m_stream));
+    if (m_vkWaitSemaphore != VK_NULL_HANDLE) {
-        }
+      checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore));
      vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr);
    }
    if (m_xyPositionBuffer != VK_NULL_HANDLE) {
      vkDestroyBuffer(m_device, m_xyPositionBuffer, nullptr);
    }
    if (m_xyPositionMemory != VK_NULL_HANDLE) {
      vkFreeMemory(m_device, m_xyPositionMemory, nullptr);
    }
    if (m_inCircleBuffer != VK_NULL_HANDLE) {
      vkDestroyBuffer(m_device, m_inCircleBuffer, nullptr);
    }
    if (m_inCircleMemory != VK_NULL_HANDLE) {
      vkFreeMemory(m_device, m_inCircleMemory, nullptr);
    }
  }
-        if (m_vkSignalSemaphore != VK_NULL_HANDLE) {
+  void fillRenderingCommandBuffer(VkCommandBuffer& commandBuffer) {
-            checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore));
+    VkBuffer vertexBuffers[] = {m_inCircleBuffer, m_xyPositionBuffer};
-            vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr);
+    VkDeviceSize offsets[] = {0, 0};
-        }
+    vkCmdBindVertexBuffers(commandBuffer, 0,
-        if (m_vkWaitSemaphore != VK_NULL_HANDLE) {
+                           sizeof(vertexBuffers) / sizeof(vertexBuffers[0]),
-            checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore));
+                           vertexBuffers, offsets);
-            vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr);
+    vkCmdDraw(commandBuffer, (uint32_t)(m_sim.getNumPoints()), 1, 0, 0);
-        }
+  }
-        if (m_xyPositionBuffer != VK_NULL_HANDLE) {
+
-            vkDestroyBuffer(m_device, m_xyPositionBuffer, nullptr);
+  void getVertexDescriptions(
-        }
+      std::vector<VkVertexInputBindingDescription>& bindingDesc,
-        if (m_xyPositionMemory != VK_NULL_HANDLE) {
+      std::vector<VkVertexInputAttributeDescription>& attribDesc) {
-            vkFreeMemory(m_device, m_xyPositionMemory, nullptr);
+    bindingDesc.resize(2);
-        }
+    attribDesc.resize(2);
-        if (m_inCircleBuffer != VK_NULL_HANDLE) {
+
-            vkDestroyBuffer(m_device, m_inCircleBuffer, nullptr);
+    bindingDesc[0].binding = 0;
-        }
+    bindingDesc[0].stride = sizeof(float);
-        if (m_inCircleMemory != VK_NULL_HANDLE) {
+    bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
-            vkFreeMemory(m_device, m_inCircleMemory, nullptr);
+
-        }
+    bindingDesc[1].binding = 1;
    bindingDesc[1].stride = sizeof(vec2);
    bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
    attribDesc[0].binding = 0;
    attribDesc[0].location = 0;
    attribDesc[0].format = VK_FORMAT_R32_SFLOAT;
    attribDesc[0].offset = 0;
    attribDesc[1].binding = 1;
    attribDesc[1].location = 1;
    attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT;
    attribDesc[1].offset = 0;
  }
  void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info) {
    info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
    info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
    info.primitiveRestartEnable = VK_FALSE;
  }
  void getWaitFrameSemaphores(
      std::vector<VkSemaphore>& wait,
      std::vector<VkPipelineStageFlags>& waitStages) const {
    if (m_currentFrame != 0) {
      // Have vulkan wait until cuda is done with the vertex buffer before
      // rendering
      // We don't do this on the first frame, as the wait semaphore hasn't been
      // initialized yet
      wait.push_back(m_vkWaitSemaphore);
      // We want to wait until all the pipeline commands are complete before
      // letting cuda work
      waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
    }
  }
  void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const {
    // Add this semaphore for vulkan to signal once the vertex buffer is ready
    // for cuda to modify
    signal.push_back(m_vkSignalSemaphore);
  }
  void initVulkanApp() {
    const size_t nVerts = m_sim.getNumPoints();
    // Obtain cuda device id for the device corresponding to the Vulkan physical
    // device
    int deviceCount;
    int cudaDevice = cudaInvalidDeviceId;
    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
    for (int dev = 0; dev < deviceCount; ++dev) {
      cudaDeviceProp devProp = {};
      checkCudaErrors(cudaGetDeviceProperties(&devProp, dev));
      if (isVkPhysicalDeviceUuid(&devProp.uuid)) {
        cudaDevice = dev;
        break;
      }
    }
    if (cudaDevice == cudaInvalidDeviceId) {
      throw std::runtime_error("No Suitable device found!");
    }
-    void fillRenderingCommandBuffer(VkCommandBuffer& commandBuffer) {
+    // On the corresponding cuda device, create the cuda stream we'll be using
-        VkBuffer vertexBuffers[] = { m_inCircleBuffer, m_xyPositionBuffer };
+    checkCudaErrors(cudaSetDevice(cudaDevice));
-        VkDeviceSize offsets[] = { 0, 0 };
+    checkCudaErrors(
-        vkCmdBindVertexBuffers(commandBuffer, 0, sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), vertexBuffers, offsets);
+        cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking));
-        vkCmdDraw(commandBuffer, (uint32_t)(m_sim.getNumPoints()), 1, 0, 0);
+    m_sim.initSimulation(cudaDevice, m_stream);
    importExternalBuffer(
        (void*)(uintptr_t)m_sim.getPositionShareableHandle(),
        getDefaultMemHandleType(), nVerts * sizeof(vec2),
        VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
        VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_xyPositionBuffer,
        m_xyPositionMemory);
    importExternalBuffer(
        (void*)(uintptr_t)m_sim.getInCircleShareableHandle(),
        getDefaultMemHandleType(), nVerts * sizeof(float),
        VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
        VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_inCircleBuffer,
        m_inCircleMemory);
    // Create the semaphore vulkan will signal when it's done with the vertex
    // buffer
    createExternalSemaphore(m_vkSignalSemaphore,
                            getDefaultSemaphoreHandleType());
    // Create the semaphore vulkan will wait for before using the vertex buffer
    createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType());
    // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait
    importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore,
                                getDefaultSemaphoreHandleType());
    // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait
    importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore,
                                getDefaultSemaphoreHandleType());
  }
  void importCudaExternalSemaphore(
      cudaExternalSemaphore_t& cudaSem, VkSemaphore& vkSem,
      VkExternalSemaphoreHandleTypeFlagBits handleType) {
    cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {};
    if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) {
      externalSemaphoreHandleDesc.type =
          cudaExternalSemaphoreHandleTypeOpaqueWin32;
    } else if (handleType &
               VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) {
      externalSemaphoreHandleDesc.type =
          cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt;
    } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) {
      externalSemaphoreHandleDesc.type =
          cudaExternalSemaphoreHandleTypeOpaqueFd;
    } else {
      throw std::runtime_error("Unknown handle type requested!");
    }
    void getVertexDescriptions(std::vector<VkVertexInputBindingDescription>& bindingDesc, std::vector<VkVertexInputAttributeDescription>& attribDesc) {
        bindingDesc.resize(2);
        attribDesc.resize(2);
        bindingDesc[0].binding = 0;
        bindingDesc[0].stride = sizeof(float);
        bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
        bindingDesc[1].binding = 1;
        bindingDesc[1].stride = sizeof(vec2);
        bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
        attribDesc[0].binding = 0;
        attribDesc[0].location = 0;
        attribDesc[0].format = VK_FORMAT_R32_SFLOAT;
        attribDesc[0].offset = 0;
        attribDesc[1].binding = 1;
        attribDesc[1].location = 1;
        attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT;
        attribDesc[1].offset = 0;
    }
    void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info) {
        info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
        info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
        info.primitiveRestartEnable = VK_FALSE;
    }
    void getWaitFrameSemaphores(std::vector<VkSemaphore>& wait, std::vector< VkPipelineStageFlags>& waitStages) const {
        if (m_currentFrame != 0) {
            // Have vulkan wait until cuda is done with the vertex buffer before rendering
            // We don't do this on the first frame, as the wait semaphore hasn't been initialized yet
            wait.push_back(m_vkWaitSemaphore);
            // We want to wait until all the pipeline commands are complete before letting cuda work
            waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
        }
    }
    void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const {
        // Add this semaphore for vulkan to signal once the vertex buffer is ready for cuda to modify
        signal.push_back(m_vkSignalSemaphore);
    }
    void initVulkanApp() {
        const size_t nVerts = m_sim.getNumPoints();
        // Obtain cuda device id for the device corresponding to the Vulkan physical device
        int deviceCount;
        int cudaDevice = cudaInvalidDeviceId;
        checkCudaErrors(cudaGetDeviceCount(&deviceCount));
        for (int dev = 0; dev < deviceCount; ++dev) {
            cudaDeviceProp devProp = { };
            checkCudaErrors(cudaGetDeviceProperties(&devProp, dev));
            if (isVkPhysicalDeviceUuid(&devProp.uuid)) {
                cudaDevice = dev;
                break;
            }
        }
        if (cudaDevice == cudaInvalidDeviceId) {
            throw std::runtime_error("No Suitable device found!");
        }
        // On the corresponding cuda device, create the cuda stream we'll be using
        checkCudaErrors(cudaSetDevice(cudaDevice));
        checkCudaErrors(cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking));
        m_sim.initSimulation(cudaDevice, m_stream);
        importExternalBuffer((void *)(uintptr_t)m_sim.getPositionShareableHandle(), getDefaultMemHandleType(), nVerts * sizeof(vec2),
            VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_xyPositionBuffer, m_xyPositionMemory);
        importExternalBuffer((void *)(uintptr_t)m_sim.getInCircleShareableHandle(), getDefaultMemHandleType(), nVerts * sizeof(float),
            VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_inCircleBuffer, m_inCircleMemory);
        // Create the semaphore vulkan will signal when it's done with the vertex buffer
        createExternalSemaphore(m_vkSignalSemaphore, getDefaultSemaphoreHandleType());
        // Create the semaphore vulkan will wait for before using the vertex buffer
        createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType());
        // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait
        importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, getDefaultSemaphoreHandleType());
        // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait
        importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, getDefaultSemaphoreHandleType());
    }
    void importCudaExternalSemaphore(cudaExternalSemaphore_t& cudaSem, VkSemaphore& vkSem, VkExternalSemaphoreHandleTypeFlagBits handleType) {
        cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {};
        if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) {
            externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32;
        }
        else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) {
            externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt;
        }
        else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) {
            externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd;
        }
        else {
            throw std::runtime_error("Unknown handle type requested!");
        }
 #ifdef _WIN64
-        externalSemaphoreHandleDesc.handle.win32.handle = (HANDLE)getSemaphoreHandle(vkSem, handleType);
+    externalSemaphoreHandleDesc.handle.win32.handle =
        (HANDLE)getSemaphoreHandle(vkSem, handleType);
 #else
-        externalSemaphoreHandleDesc.handle.fd = (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType);
+    externalSemaphoreHandleDesc.handle.fd =
        (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType);
 #endif
-        externalSemaphoreHandleDesc.flags = 0;
+    externalSemaphoreHandleDesc.flags = 0;
-        checkCudaErrors(cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc));
+    checkCudaErrors(
-    }
+        cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc));
  }
-    VkDeviceSize getUniformSize() const {
+  VkDeviceSize getUniformSize() const { return sizeof(UniformBufferObject); }
        return sizeof(UniformBufferObject);
    }
-    void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame) {
+  void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame) {
-        m_ubo.frame = (float)globalFrame;
+    m_ubo.frame = (float)globalFrame;
-        void *data;
+    void* data;
-        vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, &data);
+    vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0,
-        memcpy(data, &m_ubo, sizeof(m_ubo));
+                &data);
-        vkUnmapMemory(m_device, m_uniformMemory[imageIndex]);
+    memcpy(data, &m_ubo, sizeof(m_ubo));
-    }
+    vkUnmapMemory(m_device, m_uniformMemory[imageIndex]);
  }
-    std::vector<const char *> getRequiredExtensions() const {
+  std::vector<const char*> getRequiredExtensions() const {
-        std::vector<const char *> extensions;
+    std::vector<const char*> extensions;
-        extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME);
-        extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME);
-        return extensions;
+    extensions.push_back(VK_KHR_EXTERNAL_FENCE_CAPABILITIES_EXTENSION_NAME);
-    }
+    extensions.push_back(
        VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME);
    return extensions;
  }
-    std::vector<const char *> getRequiredDeviceExtensions() const {
+  std::vector<const char*> getRequiredDeviceExtensions() const {
-        std::vector<const char *> extensions;
+    std::vector<const char*> extensions;
-        extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME);
-        extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME);
 #ifdef _WIN64
-        extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME);
-        extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME);
 #else
-        extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME);
-        extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME);
 #endif /* _WIN64 */
-        return extensions;
+    return extensions;
  }
  void drawFrame() {
    static chrono_tp startTime = std::chrono::high_resolution_clock::now();
    chrono_tp currentTime = std::chrono::high_resolution_clock::now();
    float time = std::chrono::duration<float, std::chrono::seconds::period>(
                     currentTime - startTime)
                     .count();
    if (m_currentFrame == 0) {
      m_lastTime = startTime;
    }
-    void drawFrame() {
+    cudaExternalSemaphoreWaitParams waitParams = {};
-        static chrono_tp startTime = std::chrono::high_resolution_clock::now();
+    waitParams.flags = 0;
    waitParams.params.fence.value = 0;
-        chrono_tp currentTime = std::chrono::high_resolution_clock::now();
+    cudaExternalSemaphoreSignalParams signalParams = {};
-        float time = std::chrono::duration<float, std::chrono::seconds::period>(currentTime - startTime).count();
+    signalParams.flags = 0;
    signalParams.params.fence.value = 0;
-        if (m_currentFrame == 0) {
+    // Have vulkan draw the current frame...
-            m_lastTime = startTime;
+    VulkanBaseApp::drawFrame();
-        }
+    // Wait for vulkan to complete its work
    checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore,
                                                    &waitParams, 1, m_stream));
    // Now step the simulation
    m_sim.stepSimulation(time, m_stream);
-        cudaExternalSemaphoreWaitParams waitParams = {};
+    // Signal vulkan to continue with the updated buffers
-        waitParams.flags = 0;
+    checkCudaErrors(cudaSignalExternalSemaphoresAsync(
-        waitParams.params.fence.value = 0;
+        &m_cudaSignalSemaphore, &signalParams, 1, m_stream));
-
+  }
        cudaExternalSemaphoreSignalParams signalParams = {};
        signalParams.flags = 0;
        signalParams.params.fence.value = 0;
        // Have vulkan draw the current frame...
        VulkanBaseApp::drawFrame();
        // Wait for vulkan to complete its work
        checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore, &waitParams, 1, m_stream));
        // Now step the simulation
        m_sim.stepSimulation(time, m_stream);
        // Signal vulkan to continue with the updated buffers
        checkCudaErrors(cudaSignalExternalSemaphoresAsync(&m_cudaSignalSemaphore, &signalParams, 1, m_stream));
    }
 };
-int main(int argc, char **argv)
+int main(int argc, char** argv) {
-{
+  execution_path = argv[0];
-    execution_path = argv[0];
+  VulkanCudaPi app(NUM_SIMULATION_POINTS);
-    VulkanCudaPi app(NUM_SIMULATION_POINTS);
+  app.init();
-    app.init();
+  app.mainLoop();
-    app.mainLoop();
+  return 0;
    return 0;
 }
--- a/Samples/simpleVulkanMMAP/vert.spv
+++ b/Samples/simpleVulkanMMAP/vert.spv
--- a/Samples/vulkanImageCUDA/Build_instructions.txt
+++ b/Samples/vulkanImageCUDA/Build_instructions.txt
@ -19,8 +19,17 @@ For Linux:
 -- Install "libxcb1-dev" and "xorg-dev", as GLFW3 depends on them
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
 For Linux aarch64(L4T):
 -- Install the GLFW3 library using "sudo apt-get install libglfw3-dev"; this will provide glfw3
 -- The install above will also pull in libvulkan-dev as a dependency
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
 -- Pass the path to the Vulkan SDK while building: 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>'; VULKAN_SDK_PATH in this scenario is typically "/usr"
 For Shader changes:
 -- Update the shader.vert and/or shader.frag shader source files as required
 -- Use the glslc shader compiler from the installed Vulkan SDK's bin directory to compile shaders as:
    glslc shader.vert -o vert.spv
    glslc shader.frag -o frag.spv
 ** Make sure to add glslc's path to your PATH environment variable **
--- a/Samples/vulkanImageCUDA/frag.spv
+++ b/Samples/vulkanImageCUDA/frag.spv
--- a/Samples/vulkanImageCUDA/vert.spv
+++ b/Samples/vulkanImageCUDA/vert.spv
--- a/Samples/vulkanImageCUDA/vulkanImageCUDA.cu
+++ b/Samples/vulkanImageCUDA/vulkanImageCUDA.cu
@ -69,7 +69,7 @@ const std::vector<const char*> validationLayers = {
    "VK_LAYER_KHRONOS_validation"};
 #ifdef NDEBUG
-const bool enableValidationLayers = false;
+const bool enableValidationLayers = true;
 #else
 const bool enableValidationLayers = false;
 #endif
@ -494,7 +494,7 @@ class vulkanImageCUDA {
  unsigned int* image_data = NULL;
  unsigned int imageWidth, imageHeight;
-  unsigned int mipLevels;
+  unsigned int mipLevels = 1;
  size_t totalImageMemSize;
  // CUDA objects
@ -630,6 +630,9 @@ class vulkanImageCUDA {
    vkDestroyBuffer(device, vertexBuffer, nullptr);
    vkFreeMemory(device, vertexBufferMemory, nullptr);
    vkDestroySemaphore(device, cudaUpdateVkSemaphore, nullptr);
    vkDestroySemaphore(device, vkUpdateCudaSemaphore, nullptr);
    for (size_t i = 0; i < MAX_FRAMES; i++) {
      vkDestroySemaphore(device, renderFinishedSemaphores[i], nullptr);
      vkDestroySemaphore(device, imageAvailableSemaphores[i], nullptr);
@ -686,7 +689,7 @@ class vulkanImageCUDA {
    appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
    appInfo.pEngineName = "No Engine";
    appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
-    appInfo.apiVersion = VK_API_VERSION_1_0;
+    appInfo.apiVersion = VK_API_VERSION_1_1;
    VkInstanceCreateInfo createInfo = {};
    createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
@ -905,6 +908,7 @@ class vulkanImageCUDA {
    }
    VkPhysicalDeviceFeatures deviceFeatures = {};
    deviceFeatures.samplerAnisotropy = VK_TRUE;
    VkDeviceCreateInfo createInfo = {};
    createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
@ -1078,8 +1082,8 @@ class vulkanImageCUDA {
  }
  void createGraphicsPipeline() {
-    auto vertShaderCode = readFile("shader.vert");
+    auto vertShaderCode = readFile("vert.spv");
-    auto fragShaderCode = readFile("shader.frag");
+    auto fragShaderCode = readFile("frag.spv");
    VkShaderModule vertShaderModule = createShaderModule(vertShaderCode);
    VkShaderModule fragShaderModule = createShaderModule(fragShaderCode);
@ -1268,7 +1272,7 @@ class vulkanImageCUDA {
    // VK_FORMAT_R8G8B8A8_UNORM changed to VK_FORMAT_R8G8B8A8_UINT
    createImage(
-        imageWidth, imageHeight, VK_FORMAT_R8G8B8A8_UINT,
+        imageWidth, imageHeight, VK_FORMAT_R8G8B8A8_UNORM,
        VK_IMAGE_TILING_OPTIMAL,
        VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
            VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT,
@ -1280,9 +1284,6 @@ class vulkanImageCUDA {
    copyBufferToImage(stagingBuffer, textureImage,
                      static_cast<uint32_t>(imageWidth),
                      static_cast<uint32_t>(imageHeight));
    transitionImageLayout(textureImage, VK_FORMAT_R8G8B8A8_UINT,
                          VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
                          VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
    vkDestroyBuffer(device, stagingBuffer, nullptr);
    vkFreeMemory(device, stagingBufferMemory, nullptr);
@ -1523,8 +1524,13 @@ class vulkanImageCUDA {
    vkExternalMemImageCreateInfo.sType =
        VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO;
    vkExternalMemImageCreateInfo.pNext = NULL;
 #ifdef _WIN64
    vkExternalMemImageCreateInfo.handleTypes =
        VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
 #else
    vkExternalMemImageCreateInfo.handleTypes =
        VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
 #endif
    imageInfo.pNext = &vkExternalMemImageCreateInfo;
@ -2201,7 +2207,6 @@ class vulkanImageCUDA {
      throw std::runtime_error(
          "failed to create synchronization objects for a CUDA-Vulkan!");
    }
  }
  void updateUniformBuffer() {
@ -2333,8 +2338,8 @@ class vulkanImageCUDA {
    submitInfo.signalSemaphoreCount = 2;
    submitInfo.pSignalSemaphores = signalSemaphores;
-    if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, inFlightFences[currentFrame]) !=
+    if (vkQueueSubmit(graphicsQueue, 1, &submitInfo,
-        VK_SUCCESS) {
+                      inFlightFences[currentFrame]) != VK_SUCCESS) {
      throw std::runtime_error("failed to submit draw command buffer!");
    }
  }
@ -2360,8 +2365,8 @@ class vulkanImageCUDA {
    submitInfo.signalSemaphoreCount = 2;
    submitInfo.pSignalSemaphores = signalSemaphores;
-    if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, inFlightFences[currentFrame]) !=
+    if (vkQueueSubmit(graphicsQueue, 1, &submitInfo,
-        VK_SUCCESS) {
+                      inFlightFences[currentFrame]) != VK_SUCCESS) {
      throw std::runtime_error("failed to submit draw command buffer!");
    }
  }