update vulkan samples with SPIR-V shaders

2025-12-16 02:27:49 +08:00 · 2021-06-02 17:17:21 +05:30 · 2021-06-02 17:17:21 +05:30 · 7a5b3e6c8c
commit 7a5b3e6c8c
parent 5c3ec60fae
17 changed files with 2368 additions and 2116 deletions
--- a/Samples/simpleVulkan/Build_instructions.txt
+++ b/Samples/simpleVulkan/Build_instructions.txt
@ -19,8 +19,17 @@ For Linux:
 -- Install "libxcb1-dev" and "xorg-dev", as GLFW3 depends on them
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH

+
 For Linux aarch64(L4T):
 -- Install GLFW3 library using "sudo apt-get install libglfw3-dev"; this will provide glfw3
 -- The install above will also provide libvulkan-dev as a dependency
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
 -- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr"
+
+
+For Shader changes:
+-- Update the sinewave.vert and/or sinewave.frag shader source file as required
+-- Use the glslc shader compiler from the installed Vulkan SDK's bin directory to compile shaders as:
+    glslc sinewave.vert -o vert.spv
+    glslc sinewave.frag -o frag.spv
+** Make sure to add glslc's path in your PATH environment variable **
--- a/Samples/simpleVulkan/frag.spv
+++ b/Samples/simpleVulkan/frag.spv
--- a/Samples/simpleVulkan/main.cpp
+++ b/Samples/simpleVulkan/main.cpp
@ -92,9 +92,9 @@ class VulkanCudaSineWave : public VulkanBaseApp {
    }
    // Add our compiled vulkan shader files
    char *vertex_shader_path =
-        sdkFindFilePath("sinewave.vert", execution_path.c_str());
+        sdkFindFilePath("vert.spv", execution_path.c_str());
    char *fragment_shader_path =
-        sdkFindFilePath("sinewave.frag", execution_path.c_str());
+        sdkFindFilePath("frag.spv", execution_path.c_str());
    m_shaderFiles.push_back(
        std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path));
    m_shaderFiles.push_back(
--- a/Samples/simpleVulkan/vert.spv
+++ b/Samples/simpleVulkan/vert.spv
--- a/Samples/simpleVulkanMMAP/Build_instructions.txt
+++ b/Samples/simpleVulkanMMAP/Build_instructions.txt
@ -0,0 +1,35 @@
+For Windows:
+Follow these steps once you have installed Vulkan SDK for Windows from https://www.lunarg.com/vulkan-sdk/
+-- Install GLFW3 library at suitable location
+-- Open the simpleVulkanMMAP VS project file.
+To add the GLFW3 library path
+-- Right click on the project name "simpleVulkanMMAP" and click on "Properties"
+-- In the Property pages window go to Linker -> General. Here in "Additional Library Directories" edit and add the path to glfw3dll.lib
+To add the GLFW3 headers path
+-- Right click on the project name "simpleVulkanMMAP" and click on "Properties"
+-- In the Property pages window go to the "VC++ Directories" section. Here in "Include Directories" edit and add the path to the GLFW3 headers include directory.
+** Make sure to add path to glfw3.dll in your PATH environment variable**
+
+
+For Linux:
+-- Install the Vulkan SDK from https://www.lunarg.com/vulkan-sdk/  and follow environment setup instructions.
+-- Install GLFW3 library through your OS package repository. For example: apt-get for Ubuntu and dnf for RHEL/CentOS. Below is for Ubuntu:
+    sudo apt-get install libglfw3
+    sudo apt-get install libglfw3-dev
+-- Install "libxcb1-dev" and "xorg-dev", as GLFW3 depends on them
+-- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
+
+
+For Linux aarch64(L4T):
+-- Install GLFW3 library using "sudo apt-get install libglfw3-dev"; this will provide glfw3
+-- The install above will also provide libvulkan-dev as a dependency
+-- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
+-- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr"
+
+
+For Shader changes:
+-- Update the montecarlo.vert and/or montecarlo.frag shader source file as required
+-- Use the glslc shader compiler from the installed Vulkan SDK's bin directory to compile shaders as:
+    glslc montecarlo.vert -o vert.spv
+    glslc montecarlo.frag -o frag.spv
+** Make sure to add glslc's path in your PATH environment variable **
--- a/Samples/simpleVulkanMMAP/MonteCarloPi.cu
+++ b/Samples/simpleVulkanMMAP/MonteCarloPi.cu
@ -25,7 +25,7 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

- /*
+/*
 * See: https://www.piday.org/million/
 */

@ -37,15 +37,16 @@

 #define ROUND_UP_TO_GRANULARITY(x, n) (((x + n - 1) / n) * n)

-  // `ipcHandleTypeFlag` specifies the platform specific handle type this sample
-  // uses for importing and exporting memory allocation. On Linux this sample
-  // specifies the type as CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR meaning that
-  // file descriptors will be used. On Windows this sample specifies the type as
-  // CU_MEM_HANDLE_TYPE_WIN32 meaning that NT HANDLEs will be used. The
-  // ipcHandleTypeFlag variable is a convenience variable and is passed by value
-  // to individual requests.
+// `ipcHandleTypeFlag` specifies the platform specific handle type this sample
+// uses for importing and exporting memory allocation. On Linux this sample
+// specifies the type as CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR meaning that
+// file descriptors will be used. On Windows this sample specifies the type as
+// CU_MEM_HANDLE_TYPE_WIN32 meaning that NT HANDLEs will be used. The
+// ipcHandleTypeFlag variable is a convenience variable and is passed by value
+// to individual requests.
 #if defined(__linux__)
-CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
+CUmemAllocationHandleType ipcHandleTypeFlag =
+    CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
 #else
 CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_WIN32;
 #endif
@ -78,8 +79,9 @@ void getDefaultSecurityDescriptor(CUmemAllocationProp *prop) {
 #endif
 }

-__global__ void monte_carlo_kernel(vec2 *xyVector, float *pointsInsideCircle, float *numPointsInCircle, unsigned int numPoints, float time)
-{
+__global__ void monte_carlo_kernel(vec2 *xyVector, float *pointsInsideCircle,
+                                   float *numPointsInCircle,
+                                   unsigned int numPoints, float time) {
  const size_t stride = gridDim.x * blockDim.x;
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  float count = 0.0f;
@ -96,26 +98,24 @@ __global__ void monte_carlo_kernel(vec2 *xyVector, float *pointsInsideCircle, fl
    xyVector[tid][1] = y;

    // Compute the distance of this point from the center (0, 0)
-        float dist = sqrtf((x*x) + (y*y));
+    float dist = sqrtf((x * x) + (y * y));

-        // If distance is less than the radius of the unit circle, the point lies in the circle.
+    // If distance is less than the radius of the unit circle, the point lies in
+    // the circle.
    pointsInsideCircle[tid] = (dist <= 1.0f);
    count += (dist <= 1.0f);
  }
  atomicAdd(numPointsInCircle, count);
 }

-MonteCarloPiSimulation::MonteCarloPiSimulation(size_t num_points) :
-    m_xyVector(nullptr),
+MonteCarloPiSimulation::MonteCarloPiSimulation(size_t num_points)
+    : m_xyVector(nullptr),
      m_pointsInsideCircle(nullptr),
      m_totalPointsInsideCircle(0),
      m_totalPointsSimulated(0),
-    m_numPoints(num_points)
-{
-}
+      m_numPoints(num_points) {}

-MonteCarloPiSimulation::~MonteCarloPiSimulation()
-{
+MonteCarloPiSimulation::~MonteCarloPiSimulation() {
  if (m_numPointsInCircle) {
    checkCudaErrors(cudaFree(m_numPointsInCircle));
    m_numPointsInCircle = nullptr;
@ -128,70 +128,82 @@ MonteCarloPiSimulation::~MonteCarloPiSimulation()
  cleanupSimulationAllocations();
 }

-void MonteCarloPiSimulation::initSimulation(int cudaDevice, cudaStream_t stream)
-{
+void MonteCarloPiSimulation::initSimulation(int cudaDevice,
+                                            cudaStream_t stream) {
  m_cudaDevice = cudaDevice;
  getIdealExecutionConfiguration();

-    // Allocate a position buffer that contains random location of the points in XY cartesian plane.
-    // Allocate a bitmap buffer which holds information of whether a point in the position buffer is inside the unit circle or not.
+  // Allocate a position buffer that contains random location of the points in
+  // XY cartesian plane.
+  // Allocate a bitmap buffer which holds information of whether a point in the
+  // position buffer is inside the unit circle or not.
  setupSimulationAllocations();

-    checkCudaErrors(cudaMalloc((float **)&m_numPointsInCircle, sizeof(*m_numPointsInCircle)));
-    checkCudaErrors(cudaMallocHost((float **)&m_hostNumPointsInCircle, sizeof(*m_hostNumPointsInCircle)));
+  checkCudaErrors(
+      cudaMalloc((float **)&m_numPointsInCircle, sizeof(*m_numPointsInCircle)));
+  checkCudaErrors(cudaMallocHost((float **)&m_hostNumPointsInCircle,
+                                 sizeof(*m_hostNumPointsInCircle)));
 }

-void MonteCarloPiSimulation::stepSimulation(float time, cudaStream_t stream)
-{
+void MonteCarloPiSimulation::stepSimulation(float time, cudaStream_t stream) {
+  checkCudaErrors(cudaMemsetAsync(m_numPointsInCircle, 0,
+                                  sizeof(*m_numPointsInCircle), stream));

-    checkCudaErrors(cudaMemsetAsync(m_numPointsInCircle, 0, sizeof(*m_numPointsInCircle), stream));
-
-    monte_carlo_kernel << < m_blocks, m_threads, 0, stream >> > (m_xyVector, m_pointsInsideCircle, m_numPointsInCircle, m_numPoints, time);
+  monte_carlo_kernel<<<m_blocks, m_threads, 0, stream>>>(
+      m_xyVector, m_pointsInsideCircle, m_numPointsInCircle, m_numPoints, time);
  getLastCudaError("Failed to launch CUDA simulation");

-    checkCudaErrors(cudaMemcpyAsync(m_hostNumPointsInCircle, m_numPointsInCircle, sizeof(*m_numPointsInCircle), cudaMemcpyDeviceToHost, stream));
+  checkCudaErrors(cudaMemcpyAsync(m_hostNumPointsInCircle, m_numPointsInCircle,
+                                  sizeof(*m_numPointsInCircle),
+                                  cudaMemcpyDeviceToHost, stream));

  // Queue up a stream callback to compute and print the PI value.
-    checkCudaErrors(cudaLaunchHostFunc(stream, this->computePiCallback, (void *)this));
+  checkCudaErrors(
+      cudaLaunchHostFunc(stream, this->computePiCallback, (void *)this));
 }

-void MonteCarloPiSimulation::computePiCallback(void *args)
-{
+void MonteCarloPiSimulation::computePiCallback(void *args) {
  MonteCarloPiSimulation *cbData = (MonteCarloPiSimulation *)args;
  cbData->m_totalPointsInsideCircle += *(cbData->m_hostNumPointsInCircle);
  cbData->m_totalPointsSimulated += cbData->m_numPoints;
-    double piValue = 4.0 * ((double)cbData->m_totalPointsInsideCircle / (double)cbData->m_totalPointsSimulated);
-    printf("Approximate Pi value for %zd data points: %lf \n", cbData->m_totalPointsSimulated, piValue);
+  double piValue = 4.0 * ((double)cbData->m_totalPointsInsideCircle /
+                          (double)cbData->m_totalPointsSimulated);
+  printf("Approximate Pi value for %zd data points: %lf \n",
+         cbData->m_totalPointsSimulated, piValue);
 }

-void MonteCarloPiSimulation::getIdealExecutionConfiguration()
-{
+void MonteCarloPiSimulation::getIdealExecutionConfiguration() {
  int warpSize = 0;
  int multiProcessorCount = 0;

  checkCudaErrors(cudaSetDevice(m_cudaDevice));
-    checkCudaErrors(cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, m_cudaDevice));
+  checkCudaErrors(
+      cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, m_cudaDevice));

-    // We don't need large block sizes, since there's not much inter-thread communication
+  // We don't need large block sizes, since there's not much inter-thread
+  // communication
  m_threads = warpSize;

  // Use the occupancy calculator and fill the gpu as best as we can
-    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_blocks, monte_carlo_kernel, warpSize, 0));
+  checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &m_blocks, monte_carlo_kernel, warpSize, 0));

-    checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, m_cudaDevice));
+  checkCudaErrors(cudaDeviceGetAttribute(
+      &multiProcessorCount, cudaDevAttrMultiProcessorCount, m_cudaDevice));
  m_blocks *= multiProcessorCount;

-    // Go ahead and the clamp the blocks to the minimum needed for this height/width
-    m_blocks = std::min(m_blocks, (int)((m_numPoints + m_threads - 1) / m_threads));
+  // Go ahead and clamp the blocks to the minimum needed for this number of
+  // points
+  m_blocks =
+      std::min(m_blocks, (int)((m_numPoints + m_threads - 1) / m_threads));
 }

-void MonteCarloPiSimulation::setupSimulationAllocations()
-{
+void MonteCarloPiSimulation::setupSimulationAllocations() {
  CUdeviceptr d_ptr = 0U;
  size_t granularity = 0;
  CUmemGenericAllocationHandle cudaPositionHandle, cudaInCircleHandle;

-    CUmemAllocationProp allocProp = { };
+  CUmemAllocationProp allocProp = {};
  allocProp.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  allocProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  allocProp.location.id = m_cudaDevice;
@ -205,30 +217,39 @@ void MonteCarloPiSimulation::setupSimulationAllocations()
  getDefaultSecurityDescriptor(&allocProp);

  // Get the recommended granularity for m_cudaDevice.
-    checkCudaErrors(cuMemGetAllocationGranularity(&granularity, &allocProp, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
+  checkCudaErrors(cuMemGetAllocationGranularity(
+      &granularity, &allocProp, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));

  size_t xyPositionVecSize = m_numPoints * sizeof(*m_xyVector);
  size_t inCircleVecSize = m_numPoints * sizeof(*m_pointsInsideCircle);

-    size_t xyPositionSize = ROUND_UP_TO_GRANULARITY(xyPositionVecSize, granularity);
+  size_t xyPositionSize =
+      ROUND_UP_TO_GRANULARITY(xyPositionVecSize, granularity);
  size_t inCircleSize = ROUND_UP_TO_GRANULARITY(inCircleVecSize, granularity);
  m_totalAllocationSize = (xyPositionSize + inCircleSize);

  // Reserve the required contiguous VA space for the allocations
-    checkCudaErrors(cuMemAddressReserve(&d_ptr, m_totalAllocationSize, granularity, 0U, 0));
+  checkCudaErrors(
+      cuMemAddressReserve(&d_ptr, m_totalAllocationSize, granularity, 0U, 0));

  // Create the allocations as a pinned allocation on this device.
-    // Create an allocation to store all the positions of points on the xy plane and a second
-    // allocation which stores information if the corresponding position is inside the unit circle or not.
-    checkCudaErrors(cuMemCreate(&cudaPositionHandle, xyPositionSize, &allocProp, 0));
-    checkCudaErrors(cuMemCreate(&cudaInCircleHandle, inCircleSize, &allocProp, 0));
+  // Create an allocation to store all the positions of points on the xy plane
+  // and a second allocation which stores information if the corresponding
+  // position is inside the unit circle or not.
+  checkCudaErrors(
+      cuMemCreate(&cudaPositionHandle, xyPositionSize, &allocProp, 0));
+  checkCudaErrors(
+      cuMemCreate(&cudaInCircleHandle, inCircleSize, &allocProp, 0));

  // Export the allocation to a platform-specific handle. The type of handle
  // requested here must match the requestedHandleTypes field in the prop
-    // structure passed to cuMemCreate. The handle obtained here will be passed to vulkan
-    // to import the allocation.
-    checkCudaErrors(cuMemExportToShareableHandle((void *)&m_posShareableHandle, cudaPositionHandle, ipcHandleTypeFlag, 0));
-    checkCudaErrors(cuMemExportToShareableHandle((void *)&m_inCircleShareableHandle, cudaInCircleHandle, ipcHandleTypeFlag, 0));
+  // structure passed to cuMemCreate. The handle obtained here will be passed to
+  // vulkan to import the allocation.
+  checkCudaErrors(cuMemExportToShareableHandle(
+      (void *)&m_posShareableHandle, cudaPositionHandle, ipcHandleTypeFlag, 0));
+  checkCudaErrors(
+      cuMemExportToShareableHandle((void *)&m_inCircleShareableHandle,
+                                   cudaInCircleHandle, ipcHandleTypeFlag, 0));

  CUdeviceptr va_position = d_ptr;
  CUdeviceptr va_InCircle = va_position + xyPositionSize;
@ -236,12 +257,15 @@ void MonteCarloPiSimulation::setupSimulationAllocations()
  m_xyVector = (vec2 *)va_position;

  // Assign the chunk to the appropriate VA range
-    checkCudaErrors(cuMemMap(va_position, xyPositionSize, 0, cudaPositionHandle, 0));
-    checkCudaErrors(cuMemMap(va_InCircle, inCircleSize, 0, cudaInCircleHandle, 0));
+  checkCudaErrors(
+      cuMemMap(va_position, xyPositionSize, 0, cudaPositionHandle, 0));
+  checkCudaErrors(
+      cuMemMap(va_InCircle, inCircleSize, 0, cudaInCircleHandle, 0));

-    // Release the handles for the allocation. Since the allocation is currently mapped to a VA range
-    // with a previous call to cuMemMap the actual freeing of memory allocation will happen on an eventual call to
-    // cuMemUnmap. Thus the allocation will be kept live until it is unmapped.
+  // Release the handles for the allocation. Since the allocation is currently
+  // mapped to a VA range with a previous call to cuMemMap the actual freeing of
+  // memory allocation will happen on an eventual call to cuMemUnmap. Thus the
+  // allocation will be kept live until it is unmapped.
  checkCudaErrors(cuMemRelease(cudaPositionHandle));
  checkCudaErrors(cuMemRelease(cudaInCircleHandle));

@ -250,12 +274,13 @@ void MonteCarloPiSimulation::setupSimulationAllocations()
  accessDescriptor.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;

-    // Apply the access descriptor to the whole VA range. Essentially enables Read-Write access to the range.
-    checkCudaErrors(cuMemSetAccess(d_ptr, m_totalAllocationSize, &accessDescriptor, 1));
+  // Apply the access descriptor to the whole VA range. Essentially enables
+  // Read-Write access to the range.
+  checkCudaErrors(
+      cuMemSetAccess(d_ptr, m_totalAllocationSize, &accessDescriptor, 1));
 }

-void MonteCarloPiSimulation::cleanupSimulationAllocations()
-{
+void MonteCarloPiSimulation::cleanupSimulationAllocations() {
  if (m_xyVector && m_pointsInsideCircle) {
    // Unmap the mapped virtual memory region
    // Since the handles to the mapped backing stores have already been released
@ -267,7 +292,8 @@ void MonteCarloPiSimulation::cleanupSimulationAllocations()
    checkIpcErrors(ipcCloseShareableHandle(m_inCircleShareableHandle));

    // Free the virtual address region.
-        checkCudaErrors(cuMemAddressFree((CUdeviceptr)m_xyVector, m_totalAllocationSize));
+    checkCudaErrors(
+        cuMemAddressFree((CUdeviceptr)m_xyVector, m_totalAllocationSize));

    m_xyVector = nullptr;
    m_pointsInsideCircle = nullptr;
--- a/Samples/simpleVulkanMMAP/MonteCarloPi.h
+++ b/Samples/simpleVulkanMMAP/MonteCarloPi.h
@ -39,33 +39,35 @@

 typedef float vec2[2];

-class MonteCarloPiSimulation
-{
+class MonteCarloPiSimulation {
  size_t m_numPoints;

-    // Pointers to Cuda allocated buffers which are imported and used by vulkan as vertex buffer
+  // Pointers to Cuda allocated buffers which are imported and used by vulkan as
+  // vertex buffer
  vec2 *m_xyVector;
  float *m_pointsInsideCircle;

-    // Pointers to device and host allocated memories storing number of points that are inside the unit circle
+  // Pointers to device and host allocated memories storing number of points
+  // that are inside the unit circle
  float *m_numPointsInCircle;
  float *m_hostNumPointsInCircle;

  int m_blocks, m_threads;

-    // Total size of allocations created by cuMemMap Apis. This size is the sum of sizes of
-    // m_xyVector and m_pointsInsideCircle buffers.
+  // Total size of allocations created by cuMemMap Apis. This size is the sum of
+  // sizes of m_xyVector and m_pointsInsideCircle buffers.
  size_t m_totalAllocationSize;

-    // Shareable Handles(a file descriptor on Linux and NT Handle on Windows), used for sharing cuda
+  // Shareable Handles(a file descriptor on Linux and NT Handle on Windows),
+  // used for sharing cuda
  // allocated memory with Vulkan
  ShareableHandle m_posShareableHandle, m_inCircleShareableHandle;

  // Cuda Device corresponding to the Vulkan Physical device
  int m_cudaDevice;

-    // Track and accumulate total points that have been simulated since start of the sample.
-    // The idea is to get a closer approximation to PI with time.
+  // Track and accumulate total points that have been simulated since start of
+  // the sample. The idea is to get a closer approximation to PI with time.
  size_t m_totalPointsInsideCircle;
  size_t m_totalPointsSimulated;

@ -73,28 +75,21 @@ class MonteCarloPiSimulation
  void cleanupSimulationAllocations();
  void getIdealExecutionConfiguration();

-public:
+ public:
  MonteCarloPiSimulation(size_t num_points);
  ~MonteCarloPiSimulation();
  void initSimulation(int cudaDevice, cudaStream_t stream = 0);
  void stepSimulation(float time, cudaStream_t stream = 0);
  static void computePiCallback(void *args);

-    size_t getNumPoints() const {
-        return m_numPoints;
-    }
+  size_t getNumPoints() const { return m_numPoints; }

-    float getNumPointsInCircle() const {
-        return *m_hostNumPointsInCircle;
-    }
+  float getNumPointsInCircle() const { return *m_hostNumPointsInCircle; }

-    ShareableHandle &getPositionShareableHandle() {
-        return m_posShareableHandle;
-    }
+  ShareableHandle &getPositionShareableHandle() { return m_posShareableHandle; }
  ShareableHandle &getInCircleShareableHandle() {
    return m_inCircleShareableHandle;
  }
-
 };

 #endif  // __PISIM_H__
--- a/Samples/simpleVulkanMMAP/VulkanBaseApp.cpp
+++ b/Samples/simpleVulkanMMAP/VulkanBaseApp.cpp
--- a/Samples/simpleVulkanMMAP/VulkanBaseApp.h
+++ b/Samples/simpleVulkanMMAP/VulkanBaseApp.h
@ -40,26 +40,38 @@

 struct GLFWwindow;

-class VulkanBaseApp
-{
-public:
+class VulkanBaseApp {
+ public:
  VulkanBaseApp(const std::string& appName, bool enableValidation = false);
  static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType();
  static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType();
  virtual ~VulkanBaseApp();
  void init();
-    void *getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType);
-    void *getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType);
-    bool isVkPhysicalDeviceUuid(void *Uuid);
-    void createExternalSemaphore(VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType);
-    void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& bufferMemory);
-    void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer& buffer, VkDeviceMemory& bufferMemory);
-    void importExternalBuffer(void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& memory);
+  void* getMemHandle(VkDeviceMemory memory,
+                     VkExternalMemoryHandleTypeFlagBits handleType);
+  void* getSemaphoreHandle(VkSemaphore semaphore,
+                           VkExternalSemaphoreHandleTypeFlagBits handleType);
+  bool isVkPhysicalDeviceUuid(void* Uuid);
+  void createExternalSemaphore(
+      VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType);
+  void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage,
+                    VkMemoryPropertyFlags properties, VkBuffer& buffer,
+                    VkDeviceMemory& bufferMemory);
+  void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage,
+                            VkMemoryPropertyFlags properties,
+                            VkExternalMemoryHandleTypeFlagsKHR extMemHandleType,
+                            VkBuffer& buffer, VkDeviceMemory& bufferMemory);
+  void importExternalBuffer(void* handle,
+                            VkExternalMemoryHandleTypeFlagBits handleType,
+                            size_t size, VkBufferUsageFlags usage,
+                            VkMemoryPropertyFlags properties, VkBuffer& buffer,
+                            VkDeviceMemory& memory);
  void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size);
  VkCommandBuffer beginSingleTimeCommands();
  void endSingleTimeCommands(VkCommandBuffer commandBuffer);
  void mainLoop();
-protected:
+
+ protected:
  const std::string m_appName;
  const bool m_enableValidation;
  VkInstance m_instance;
@ -99,17 +111,23 @@ protected:

  virtual void initVulkanApp() {}
  virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {}
-    virtual std::vector<const char *> getRequiredExtensions() const;
-    virtual std::vector<const char *> getRequiredDeviceExtensions() const;
-    virtual void getVertexDescriptions(std::vector<VkVertexInputBindingDescription>& bindingDesc, std::vector<VkVertexInputAttributeDescription>& attribDesc);
-    virtual void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info);
-    virtual void getWaitFrameSemaphores(std::vector<VkSemaphore>& wait, std::vector< VkPipelineStageFlags>& waitStages) const;
+  virtual std::vector<const char*> getRequiredExtensions() const;
+  virtual std::vector<const char*> getRequiredDeviceExtensions() const;
+  virtual void getVertexDescriptions(
+      std::vector<VkVertexInputBindingDescription>& bindingDesc,
+      std::vector<VkVertexInputAttributeDescription>& attribDesc);
+  virtual void getAssemblyStateInfo(
+      VkPipelineInputAssemblyStateCreateInfo& info);
+  virtual void getWaitFrameSemaphores(
+      std::vector<VkSemaphore>& wait,
+      std::vector<VkPipelineStageFlags>& waitStages) const;
  virtual void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const;
  virtual VkDeviceSize getUniformSize() const;
  virtual void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame);
  virtual void drawFrame();
-private:
-    GLFWwindow *m_window;
+
+ private:
+  GLFWwindow* m_window;

  void initWindow();
  void initVulkan();
@ -134,7 +152,7 @@ private:
  void recreateSwapChain();

  bool isSuitableDevice(VkPhysicalDevice dev) const;
-    static void resizeCallback(GLFWwindow *window, int width, int height);
+  static void resizeCallback(GLFWwindow* window, int width, int height);
 };

 void readFile(std::istream& s, std::vector<char>& data);
--- a/Samples/simpleVulkanMMAP/VulkanCudaInterop.h
+++ b/Samples/simpleVulkanMMAP/VulkanCudaInterop.h
@ -35,13 +35,12 @@
 #include <helper_cuda.h>

 bool isDeviceCompatible(void *Uuid, size_t size) {
-
  int cudaDevice = cudaInvalidDeviceId;
  int deviceCount;
  checkCudaErrors(cudaGetDeviceCount(&deviceCount));

  for (int i = 0; i < deviceCount; ++i) {
-        cudaDeviceProp devProp = { };
+    cudaDeviceProp devProp = {};
    checkCudaErrors(cudaGetDeviceProperties(&devProp, i));
    if (!memcmp(&devProp.uuid, Uuid, size)) {
      cudaDevice = i;
@ -56,20 +55,28 @@ bool isDeviceCompatible(void *Uuid, size_t size) {
  int attributeVal = 0;
  int deviceComputeMode = 0;

-    checkCudaErrors(cuDeviceGetAttribute(&deviceComputeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, cudaDevice));
-    checkCudaErrors(cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cudaDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &deviceComputeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, cudaDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
+      cudaDevice));

 #if defined(__linux__)
-    checkCudaErrors(cuDeviceGetAttribute(&deviceSupportsHandle, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED, cudaDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &deviceSupportsHandle,
+      CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED,
+      cudaDevice));
 #else
-    checkCudaErrors(cuDeviceGetAttribute(&deviceSupportsHandle, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, cudaDevice));
+  checkCudaErrors(cuDeviceGetAttribute(
+      &deviceSupportsHandle,
+      CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, cudaDevice));
 #endif

-    if ((deviceComputeMode != CU_COMPUTEMODE_DEFAULT) || !attributeVal || !deviceSupportsHandle) {
+  if ((deviceComputeMode != CU_COMPUTEMODE_DEFAULT) || !attributeVal ||
+      !deviceSupportsHandle) {
    return false;
  }
  return true;
 }

 #endif  // __VKCUDA_H__
-
--- a/Samples/simpleVulkanMMAP/frag.spv
+++ b/Samples/simpleVulkanMMAP/frag.spv
--- a/Samples/simpleVulkanMMAP/main.cpp
+++ b/Samples/simpleVulkanMMAP/main.cpp
@ -25,10 +25,11 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

- /*
+/*
 * This sample demonstrates CUDA Interop with Vulkan using cuMemMap APIs.
-  * Allocating device memory and updating values in those allocations are performed by CUDA
-  * and the contents of the allocation are visualized by Vulkan.
+ * Allocating device memory and updating values in those allocations are
+ * performed by CUDA and the contents of the allocation are visualized by
+ * Vulkan.
 */

 #include "VulkanBaseApp.h"
@ -55,11 +56,8 @@

 std::string execution_path;

-class VulkanCudaPi : public VulkanBaseApp
-{
-    typedef struct UniformBufferObject_st {
-        float frame;
-    } UniformBufferObject;
+class VulkanCudaPi : public VulkanBaseApp {
+  typedef struct UniformBufferObject_st { float frame; } UniformBufferObject;

  VkBuffer m_inCircleBuffer, m_xyPositionBuffer;
  VkDeviceMemory m_inCircleMemory, m_xyPositionMemory;
@ -71,9 +69,10 @@ class VulkanCudaPi : public VulkanBaseApp
  using chrono_tp = std::chrono::time_point<std::chrono::high_resolution_clock>;
  chrono_tp m_lastTime;
  size_t m_lastFrame;
-public:
-    VulkanCudaPi(size_t num_points) :
-        VulkanBaseApp("simpleVulkanMMAP", ENABLE_VALIDATION),
+
+ public:
+  VulkanCudaPi(size_t num_points)
+      : VulkanBaseApp("simpleVulkanMMAP", ENABLE_VALIDATION),
        m_inCircleBuffer(VK_NULL_HANDLE),
        m_xyPositionBuffer(VK_NULL_HANDLE),
        m_inCircleMemory(VK_NULL_HANDLE),
@ -86,12 +85,15 @@ public:
        m_cudaWaitSemaphore(),
        m_cudaSignalSemaphore(),
        m_lastFrame(0) {
-
    // Add our compiled vulkan shader files
-        char* vertex_shader_path = sdkFindFilePath("montecarlo.vert", execution_path.c_str());
-        char* fragment_shader_path = sdkFindFilePath("montecarlo.frag", execution_path.c_str());
-        m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path));
-        m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path));
+    char* vertex_shader_path =
+        sdkFindFilePath("vert.spv", execution_path.c_str());
+    char* fragment_shader_path =
+        sdkFindFilePath("frag.spv", execution_path.c_str());
+    m_shaderFiles.push_back(
+        std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path));
+    m_shaderFiles.push_back(
+        std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path));
  }

  ~VulkanCudaPi() {
@ -124,13 +126,17 @@ public:
  }

  void fillRenderingCommandBuffer(VkCommandBuffer& commandBuffer) {
-        VkBuffer vertexBuffers[] = { m_inCircleBuffer, m_xyPositionBuffer };
-        VkDeviceSize offsets[] = { 0, 0 };
-        vkCmdBindVertexBuffers(commandBuffer, 0, sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), vertexBuffers, offsets);
+    VkBuffer vertexBuffers[] = {m_inCircleBuffer, m_xyPositionBuffer};
+    VkDeviceSize offsets[] = {0, 0};
+    vkCmdBindVertexBuffers(commandBuffer, 0,
+                           sizeof(vertexBuffers) / sizeof(vertexBuffers[0]),
+                           vertexBuffers, offsets);
    vkCmdDraw(commandBuffer, (uint32_t)(m_sim.getNumPoints()), 1, 0, 0);
  }

-    void getVertexDescriptions(std::vector<VkVertexInputBindingDescription>& bindingDesc, std::vector<VkVertexInputAttributeDescription>& attribDesc) {
+  void getVertexDescriptions(
+      std::vector<VkVertexInputBindingDescription>& bindingDesc,
+      std::vector<VkVertexInputAttributeDescription>& attribDesc) {
    bindingDesc.resize(2);
    attribDesc.resize(2);

@ -159,30 +165,37 @@ public:
    info.primitiveRestartEnable = VK_FALSE;
  }

-    void getWaitFrameSemaphores(std::vector<VkSemaphore>& wait, std::vector< VkPipelineStageFlags>& waitStages) const {
+  void getWaitFrameSemaphores(
+      std::vector<VkSemaphore>& wait,
+      std::vector<VkPipelineStageFlags>& waitStages) const {
    if (m_currentFrame != 0) {
-            // Have vulkan wait until cuda is done with the vertex buffer before rendering
-            // We don't do this on the first frame, as the wait semaphore hasn't been initialized yet
+      // Have vulkan wait until cuda is done with the vertex buffer before
+      // rendering
+      // We don't do this on the first frame, as the wait semaphore hasn't been
+      // initialized yet
      wait.push_back(m_vkWaitSemaphore);
-            // We want to wait until all the pipeline commands are complete before letting cuda work
+      // We want to wait until all the pipeline commands are complete before
+      // letting cuda work
      waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
    }
  }

  void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const {
-        // Add this semaphore for vulkan to signal once the vertex buffer is ready for cuda to modify
+    // Add this semaphore for vulkan to signal once the vertex buffer is ready
+    // for cuda to modify
    signal.push_back(m_vkSignalSemaphore);
  }

  void initVulkanApp() {
    const size_t nVerts = m_sim.getNumPoints();

-        // Obtain cuda device id for the device corresponding to the Vulkan physical device
+    // Obtain cuda device id for the device corresponding to the Vulkan physical
+    // device
    int deviceCount;
    int cudaDevice = cudaInvalidDeviceId;
    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
    for (int dev = 0; dev < deviceCount; ++dev) {
-            cudaDeviceProp devProp = { };
+      cudaDeviceProp devProp = {};
      checkCudaErrors(cudaGetDeviceProperties(&devProp, dev));
      if (isVkPhysicalDeviceUuid(&devProp.uuid)) {
        cudaDevice = dev;
@ -195,75 +208,94 @@ public:

    // On the corresponding cuda device, create the cuda stream we'll be using
    checkCudaErrors(cudaSetDevice(cudaDevice));
-        checkCudaErrors(cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking));
+    checkCudaErrors(
+        cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking));
    m_sim.initSimulation(cudaDevice, m_stream);

-        importExternalBuffer((void *)(uintptr_t)m_sim.getPositionShareableHandle(), getDefaultMemHandleType(), nVerts * sizeof(vec2),
+    importExternalBuffer(
+        (void*)(uintptr_t)m_sim.getPositionShareableHandle(),
+        getDefaultMemHandleType(), nVerts * sizeof(vec2),
        VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
-            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_xyPositionBuffer, m_xyPositionMemory);
+        VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_xyPositionBuffer,
+        m_xyPositionMemory);

-        importExternalBuffer((void *)(uintptr_t)m_sim.getInCircleShareableHandle(), getDefaultMemHandleType(), nVerts * sizeof(float),
+    importExternalBuffer(
+        (void*)(uintptr_t)m_sim.getInCircleShareableHandle(),
+        getDefaultMemHandleType(), nVerts * sizeof(float),
        VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
-            VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_inCircleBuffer, m_inCircleMemory);
+        VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_inCircleBuffer,
+        m_inCircleMemory);

-        // Create the semaphore vulkan will signal when it's done with the vertex buffer
-        createExternalSemaphore(m_vkSignalSemaphore, getDefaultSemaphoreHandleType());
+    // Create the semaphore vulkan will signal when it's done with the vertex
+    // buffer
+    createExternalSemaphore(m_vkSignalSemaphore,
+                            getDefaultSemaphoreHandleType());
    // Create the semaphore vulkan will wait for before using the vertex buffer
    createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType());
    // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait
-        importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, getDefaultSemaphoreHandleType());
+    importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore,
+                                getDefaultSemaphoreHandleType());
    // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait
-        importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, getDefaultSemaphoreHandleType());
+    importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore,
+                                getDefaultSemaphoreHandleType());
  }

-    void importCudaExternalSemaphore(cudaExternalSemaphore_t& cudaSem, VkSemaphore& vkSem, VkExternalSemaphoreHandleTypeFlagBits handleType) {
+  void importCudaExternalSemaphore(
+      cudaExternalSemaphore_t& cudaSem, VkSemaphore& vkSem,
+      VkExternalSemaphoreHandleTypeFlagBits handleType) {
    cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {};

    if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) {
-            externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32;
-        }
-        else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) {
-            externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt;
-        }
-        else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) {
-            externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd;
-        }
-        else {
+      externalSemaphoreHandleDesc.type =
+          cudaExternalSemaphoreHandleTypeOpaqueWin32;
+    } else if (handleType &
+               VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) {
+      externalSemaphoreHandleDesc.type =
+          cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt;
+    } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) {
+      externalSemaphoreHandleDesc.type =
+          cudaExternalSemaphoreHandleTypeOpaqueFd;
+    } else {
      throw std::runtime_error("Unknown handle type requested!");
    }

 #ifdef _WIN64
-        externalSemaphoreHandleDesc.handle.win32.handle = (HANDLE)getSemaphoreHandle(vkSem, handleType);
+    externalSemaphoreHandleDesc.handle.win32.handle =
+        (HANDLE)getSemaphoreHandle(vkSem, handleType);
 #else
-        externalSemaphoreHandleDesc.handle.fd = (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType);
+    externalSemaphoreHandleDesc.handle.fd =
+        (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType);
 #endif

    externalSemaphoreHandleDesc.flags = 0;

-        checkCudaErrors(cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc));
+    checkCudaErrors(
+        cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc));
  }

-    VkDeviceSize getUniformSize() const {
-        return sizeof(UniformBufferObject);
-    }
+  VkDeviceSize getUniformSize() const { return sizeof(UniformBufferObject); }

  void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame) {
    m_ubo.frame = (float)globalFrame;
-        void *data;
-        vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, &data);
+    void* data;
+    vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0,
+                &data);
    memcpy(data, &m_ubo, sizeof(m_ubo));
    vkUnmapMemory(m_device, m_uniformMemory[imageIndex]);
  }

-    std::vector<const char *> getRequiredExtensions() const {
-        std::vector<const char *> extensions;
+  std::vector<const char*> getRequiredExtensions() const {
+    std::vector<const char*> extensions;
    extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME);
    extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME);
+    extensions.push_back(VK_KHR_EXTERNAL_FENCE_CAPABILITIES_EXTENSION_NAME);
+    extensions.push_back(
+        VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME);
    return extensions;
  }

-    std::vector<const char *> getRequiredDeviceExtensions() const {
-        std::vector<const char *> extensions;
+  std::vector<const char*> getRequiredDeviceExtensions() const {
+    std::vector<const char*> extensions;

    extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME);
    extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME);
@ -281,7 +313,9 @@ public:
    static chrono_tp startTime = std::chrono::high_resolution_clock::now();

    chrono_tp currentTime = std::chrono::high_resolution_clock::now();
-        float time = std::chrono::duration<float, std::chrono::seconds::period>(currentTime - startTime).count();
+    float time = std::chrono::duration<float, std::chrono::seconds::period>(
+                     currentTime - startTime)
+                     .count();

    if (m_currentFrame == 0) {
      m_lastTime = startTime;
@ -298,17 +332,18 @@ public:
    // Have vulkan draw the current frame...
    VulkanBaseApp::drawFrame();
    // Wait for vulkan to complete its work
-        checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore, &waitParams, 1, m_stream));
+    checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore,
+                                                    &waitParams, 1, m_stream));
    // Now step the simulation
    m_sim.stepSimulation(time, m_stream);

    // Signal vulkan to continue with the updated buffers
-        checkCudaErrors(cudaSignalExternalSemaphoresAsync(&m_cudaSignalSemaphore, &signalParams, 1, m_stream));
+    checkCudaErrors(cudaSignalExternalSemaphoresAsync(
+        &m_cudaSignalSemaphore, &signalParams, 1, m_stream));
  }
 };

-int main(int argc, char **argv)
-{
+int main(int argc, char** argv) {
  execution_path = argv[0];
  VulkanCudaPi app(NUM_SIMULATION_POINTS);
  app.init();
--- a/Samples/simpleVulkanMMAP/vert.spv
+++ b/Samples/simpleVulkanMMAP/vert.spv
--- a/Samples/vulkanImageCUDA/Build_instructions.txt
+++ b/Samples/vulkanImageCUDA/Build_instructions.txt
@ -19,8 +19,17 @@ For Linux:
 -- Install "libxcb1-dev" and "xorg-dev" as GLFW3 depends on them
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH

+
 For Linux aarch64(L4T):
 -- Install the GLFW3 library using "sudo apt-get install libglfw3-dev"; this will provide glfw3
 -- The install above will also provide libvulkan-dev as a dependency
 -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
 -- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr"
+
+
+For Shader changes:
+-- Update the shader.vert and/or shader.frag shader source file as required
+-- Use the glslc shader compiler from the installed Vulkan SDK's bin directory to compile shaders as:
+    glslc shader.vert -o vert.spv
+    glslc shader.frag -o frag.spv
+** Make sure to add glslc's path to your PATH environment variable **
--- a/Samples/vulkanImageCUDA/frag.spv
+++ b/Samples/vulkanImageCUDA/frag.spv
--- a/Samples/vulkanImageCUDA/vert.spv
+++ b/Samples/vulkanImageCUDA/vert.spv
--- a/Samples/vulkanImageCUDA/vulkanImageCUDA.cu
+++ b/Samples/vulkanImageCUDA/vulkanImageCUDA.cu
@ -69,7 +69,7 @@ const std::vector<const char*> validationLayers = {
    "VK_LAYER_KHRONOS_validation"};

 #ifdef NDEBUG
-const bool enableValidationLayers = false;
+const bool enableValidationLayers = true;
 #else
 const bool enableValidationLayers = false;
 #endif
@ -494,7 +494,7 @@ class vulkanImageCUDA {

  unsigned int* image_data = NULL;
  unsigned int imageWidth, imageHeight;
-  unsigned int mipLevels;
+  unsigned int mipLevels = 1;
  size_t totalImageMemSize;

  // CUDA objects
@ -630,6 +630,9 @@ class vulkanImageCUDA {
    vkDestroyBuffer(device, vertexBuffer, nullptr);
    vkFreeMemory(device, vertexBufferMemory, nullptr);

+    vkDestroySemaphore(device, cudaUpdateVkSemaphore, nullptr);
+    vkDestroySemaphore(device, vkUpdateCudaSemaphore, nullptr);
+
    for (size_t i = 0; i < MAX_FRAMES; i++) {
      vkDestroySemaphore(device, renderFinishedSemaphores[i], nullptr);
      vkDestroySemaphore(device, imageAvailableSemaphores[i], nullptr);
@ -686,7 +689,7 @@ class vulkanImageCUDA {
    appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
    appInfo.pEngineName = "No Engine";
    appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
-    appInfo.apiVersion = VK_API_VERSION_1_0;
+    appInfo.apiVersion = VK_API_VERSION_1_1;

    VkInstanceCreateInfo createInfo = {};
    createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
@ -905,6 +908,7 @@ class vulkanImageCUDA {
    }

    VkPhysicalDeviceFeatures deviceFeatures = {};
+    deviceFeatures.samplerAnisotropy = VK_TRUE;

    VkDeviceCreateInfo createInfo = {};
    createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
@ -1078,8 +1082,8 @@ class vulkanImageCUDA {
  }

  void createGraphicsPipeline() {
-    auto vertShaderCode = readFile("shader.vert");
-    auto fragShaderCode = readFile("shader.frag");
+    auto vertShaderCode = readFile("vert.spv");
+    auto fragShaderCode = readFile("frag.spv");

    VkShaderModule vertShaderModule = createShaderModule(vertShaderCode);
    VkShaderModule fragShaderModule = createShaderModule(fragShaderCode);
@ -1268,7 +1272,7 @@ class vulkanImageCUDA {

    // VK_FORMAT_R8G8B8A8_UNORM changed to VK_FORMAT_R8G8B8A8_UINT
    createImage(
-        imageWidth, imageHeight, VK_FORMAT_R8G8B8A8_UINT,
+        imageWidth, imageHeight, VK_FORMAT_R8G8B8A8_UNORM,
        VK_IMAGE_TILING_OPTIMAL,
        VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
            VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT,
@ -1280,9 +1284,6 @@ class vulkanImageCUDA {
    copyBufferToImage(stagingBuffer, textureImage,
                      static_cast<uint32_t>(imageWidth),
                      static_cast<uint32_t>(imageHeight));
-    transitionImageLayout(textureImage, VK_FORMAT_R8G8B8A8_UINT,
-                          VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
-                          VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);

    vkDestroyBuffer(device, stagingBuffer, nullptr);
    vkFreeMemory(device, stagingBufferMemory, nullptr);
@ -1523,8 +1524,13 @@ class vulkanImageCUDA {
    vkExternalMemImageCreateInfo.sType =
        VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO;
    vkExternalMemImageCreateInfo.pNext = NULL;
+#ifdef _WIN64
+    vkExternalMemImageCreateInfo.handleTypes =
+        VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
+#else
    vkExternalMemImageCreateInfo.handleTypes =
        VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
+#endif

    imageInfo.pNext = &vkExternalMemImageCreateInfo;

@ -2201,7 +2207,6 @@ class vulkanImageCUDA {
      throw std::runtime_error(
          "failed to create synchronization objects for a CUDA-Vulkan!");
    }
-
  }

  void updateUniformBuffer() {
@ -2333,8 +2338,8 @@ class vulkanImageCUDA {
    submitInfo.signalSemaphoreCount = 2;
    submitInfo.pSignalSemaphores = signalSemaphores;

-    if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, inFlightFences[currentFrame]) !=
-        VK_SUCCESS) {
+    if (vkQueueSubmit(graphicsQueue, 1, &submitInfo,
+                      inFlightFences[currentFrame]) != VK_SUCCESS) {
      throw std::runtime_error("failed to submit draw command buffer!");
    }
  }
@ -2360,8 +2365,8 @@ class vulkanImageCUDA {
    submitInfo.signalSemaphoreCount = 2;
    submitInfo.pSignalSemaphores = signalSemaphores;

-    if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, inFlightFences[currentFrame]) !=
-        VK_SUCCESS) {
+    if (vkQueueSubmit(graphicsQueue, 1, &submitInfo,
+                      inFlightFences[currentFrame]) != VK_SUCCESS) {
      throw std::runtime_error("failed to submit draw command buffer!");
    }
  }