/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "VulkanBaseApp.h" #include #include #include #include #include "linmath.h" #include "SineWaveSimulation.h" #include typedef float vec2[2]; std::string execution_path; #ifndef NDEBUG #define ENABLE_VALIDATION (false) #else #define ENABLE_VALIDATION (true) #endif class VulkanCudaSineWave : public VulkanBaseApp { typedef struct UniformBufferObject_st { mat4x4 modelViewProj; } UniformBufferObject; VkBuffer m_heightBuffer, m_xyBuffer, m_indexBuffer; VkDeviceMemory m_heightMemory, m_xyMemory, m_indexMemory; UniformBufferObject m_ubo; VkSemaphore m_vkWaitSemaphore, m_vkSignalSemaphore; SineWaveSimulation m_sim; cudaStream_t m_stream; cudaExternalSemaphore_t m_cudaWaitSemaphore, m_cudaSignalSemaphore, m_cudaTimelineSemaphore; cudaExternalMemory_t m_cudaVertMem; float *m_cudaHeightMap; using chrono_tp = std::chrono::time_point; chrono_tp m_lastTime; size_t m_lastFrame; public: VulkanCudaSineWave(size_t width, size_t height) : VulkanBaseApp("vulkanCudaSineWave", ENABLE_VALIDATION), m_heightBuffer(VK_NULL_HANDLE), m_xyBuffer(VK_NULL_HANDLE), m_indexBuffer(VK_NULL_HANDLE), m_heightMemory(VK_NULL_HANDLE), m_xyMemory(VK_NULL_HANDLE), m_indexMemory(VK_NULL_HANDLE), m_ubo(), m_sim(width, height), m_stream(0), m_vkWaitSemaphore(VK_NULL_HANDLE), m_vkSignalSemaphore(VK_NULL_HANDLE), m_cudaWaitSemaphore(), m_cudaSignalSemaphore(), m_cudaTimelineSemaphore(), m_cudaVertMem(), m_cudaHeightMap(nullptr), m_lastFrame(0) { // Our index buffer can only index 32-bits of the vertex buffer if ((width * height) > (1ULL << 32ULL)) { throw std::runtime_error( "Requested height and width is too large for this sample!"); } // Add our compiled vulkan shader files char *vertex_shader_path = sdkFindFilePath("vert.spv", execution_path.c_str()); char *fragment_shader_path = sdkFindFilePath("frag.spv", execution_path.c_str()); m_shaderFiles.push_back( std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path)); m_shaderFiles.push_back( std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path)); } ~VulkanCudaSineWave() { // Make sure there's no pending work before we start tearing down checkCudaErrors(cudaStreamSynchronize(m_stream)); #ifdef _VK_TIMELINE_SEMAPHORE if (m_vkTimelineSemaphore != VK_NULL_HANDLE) { checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaTimelineSemaphore)); vkDestroySemaphore(m_device, m_vkTimelineSemaphore, nullptr); } #endif /* _VK_TIMELINE_SEMAPHORE */ if (m_vkSignalSemaphore != VK_NULL_HANDLE) { checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaSignalSemaphore)); vkDestroySemaphore(m_device, m_vkSignalSemaphore, nullptr); } if (m_vkWaitSemaphore != VK_NULL_HANDLE) { checkCudaErrors(cudaDestroyExternalSemaphore(m_cudaWaitSemaphore)); vkDestroySemaphore(m_device, m_vkWaitSemaphore, nullptr); } if (m_xyBuffer != VK_NULL_HANDLE) { vkDestroyBuffer(m_device, m_xyBuffer, nullptr); } if (m_xyMemory != VK_NULL_HANDLE) { vkFreeMemory(m_device, m_xyMemory, nullptr); } if (m_heightBuffer != VK_NULL_HANDLE) { vkDestroyBuffer(m_device, m_heightBuffer, nullptr); } if (m_heightMemory != VK_NULL_HANDLE) { vkFreeMemory(m_device, m_heightMemory, nullptr); } if (m_cudaHeightMap) { checkCudaErrors(cudaDestroyExternalMemory(m_cudaVertMem)); } if (m_indexBuffer != VK_NULL_HANDLE) { vkDestroyBuffer(m_device, m_indexBuffer, nullptr); } if (m_indexMemory != VK_NULL_HANDLE) { vkFreeMemory(m_device, m_indexMemory, nullptr); } } void fillRenderingCommandBuffer(VkCommandBuffer &commandBuffer) { VkBuffer vertexBuffers[] = {m_heightBuffer, m_xyBuffer}; VkDeviceSize offsets[] = {0, 0}; vkCmdBindVertexBuffers(commandBuffer, 0, sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), vertexBuffers, offsets); vkCmdBindIndexBuffer(commandBuffer, m_indexBuffer, 0, VK_INDEX_TYPE_UINT32); vkCmdDrawIndexed(commandBuffer, (uint32_t)((m_sim.getWidth() - 1) * (m_sim.getHeight() - 1) * 6), 1, 0, 0, 0); } void getVertexDescriptions( std::vector &bindingDesc, std::vector &attribDesc) { bindingDesc.resize(2); attribDesc.resize(2); bindingDesc[0].binding = 0; bindingDesc[0].stride = sizeof(float); bindingDesc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; bindingDesc[1].binding = 1; bindingDesc[1].stride = sizeof(vec2); bindingDesc[1].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; attribDesc[0].binding = 0; attribDesc[0].location = 0; attribDesc[0].format = VK_FORMAT_R32_SFLOAT; attribDesc[0].offset = 0; attribDesc[1].binding = 1; attribDesc[1].location = 1; attribDesc[1].format = VK_FORMAT_R32G32_SFLOAT; attribDesc[1].offset = 0; } void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo &info) { info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; info.primitiveRestartEnable = VK_FALSE; } void getWaitFrameSemaphores( std::vector &wait, std::vector &waitStages) const { if (m_currentFrame != 0) { // Have vulkan wait until cuda is done with the vertex buffer before // rendering, We don't do this on the first frame, as the wait semaphore // hasn't been initialized yet wait.push_back(m_vkWaitSemaphore); // We want to wait until all the pipeline commands are complete before // letting cuda work waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); } } void getSignalFrameSemaphores(std::vector &signal) const { // Add this semaphore for vulkan to signal once the vertex buffer is ready // for cuda to modify signal.push_back(m_vkSignalSemaphore); } void initVulkanApp() { int cuda_device = -1; // Select cuda device where vulkan is running. cuda_device = m_sim.initCuda(m_vkDeviceUUID, VK_UUID_SIZE); if (cuda_device == -1) { printf("Error: No CUDA-Vulkan interop capable device found\n"); exit(EXIT_FAILURE); } m_sim.initCudaLaunchConfig(cuda_device); // Create the cuda stream we'll be using checkCudaErrors( cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking)); const size_t nVerts = m_sim.getWidth() * m_sim.getHeight(); const size_t nInds = (m_sim.getWidth() - 1) * (m_sim.getHeight() - 1) * 6; // Create the height map cuda will write to createExternalBuffer( nVerts * sizeof(float), VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, getDefaultMemHandleType(), m_heightBuffer, m_heightMemory); // Create the vertex buffer that will hold the xy coordinates for the grid createBuffer(nVerts * sizeof(vec2), VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_xyBuffer, m_xyMemory); // Create the index buffer that references from both buffers above createBuffer( nInds * sizeof(uint32_t), VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_indexBuffer, m_indexMemory); // Import the height map into cuda and retrieve a device pointer to use importCudaExternalMemory((void **)&m_cudaHeightMap, m_cudaVertMem, m_heightMemory, nVerts * sizeof(*m_cudaHeightMap), getDefaultMemHandleType()); // Set the height map to use in the simulation m_sim.initSimulation(m_cudaHeightMap); { // Set up the initial values for the vertex buffers with Vulkan void *stagingBase; VkBuffer stagingBuffer; VkDeviceMemory stagingMemory; VkDeviceSize stagingSz = std::max(nVerts * sizeof(vec2), nInds * sizeof(uint32_t)); createBuffer(stagingSz, VK_BUFFER_USAGE_TRANSFER_SRC_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, stagingBuffer, stagingMemory); vkMapMemory(m_device, stagingMemory, 0, stagingSz, 0, &stagingBase); memset(stagingBase, 0, nVerts * sizeof(float)); copyBuffer(m_heightBuffer, stagingBuffer, nVerts * sizeof(float)); for (size_t y = 0; y < m_sim.getHeight(); y++) { for (size_t x = 0; x < m_sim.getWidth(); x++) { vec2 *stagedVert = (vec2 *)stagingBase; stagedVert[y * m_sim.getWidth() + x][0] = (2.0f * x) / (m_sim.getWidth() - 1) - 1; stagedVert[y * m_sim.getWidth() + x][1] = (2.0f * y) / (m_sim.getHeight() - 1) - 1; } } copyBuffer(m_xyBuffer, stagingBuffer, nVerts * sizeof(vec2)); { uint32_t *indices = (uint32_t *)stagingBase; for (size_t y = 0; y < m_sim.getHeight() - 1; y++) { for (size_t x = 0; x < m_sim.getWidth() - 1; x++) { indices[0] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 0)); indices[1] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0)); indices[2] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1)); indices[3] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 0)); indices[4] = (uint32_t)((y + 1) * m_sim.getWidth() + (x + 1)); indices[5] = (uint32_t)((y + 0) * m_sim.getWidth() + (x + 1)); indices += 6; } } } copyBuffer(m_indexBuffer, stagingBuffer, nInds * sizeof(uint32_t)); vkUnmapMemory(m_device, stagingMemory); vkDestroyBuffer(m_device, stagingBuffer, nullptr); vkFreeMemory(m_device, stagingMemory, nullptr); } #ifdef _VK_TIMELINE_SEMAPHORE // Create the timeline semaphore to sync cuda and vulkan access to vertex // buffer createExternalSemaphore(m_vkTimelineSemaphore, getDefaultSemaphoreHandleType()); // Import the timeline semaphore cuda will use to sync cuda and vulkan // access to vertex buffer importCudaExternalSemaphore(m_cudaTimelineSemaphore, m_vkTimelineSemaphore, getDefaultSemaphoreHandleType()); #else // Create the semaphore vulkan will signal when it's done with the vertex // buffer createExternalSemaphore(m_vkSignalSemaphore, getDefaultSemaphoreHandleType()); // Create the semaphore vulkan will wait for before using the vertex buffer createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, getDefaultSemaphoreHandleType()); // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); #endif /* _VK_TIMELINE_SEMAPHORE */ } void importCudaExternalMemory(void **cudaPtr, cudaExternalMemory_t &cudaMem, VkDeviceMemory &vkMem, VkDeviceSize size, VkExternalMemoryHandleTypeFlagBits handleType) { cudaExternalMemoryHandleDesc externalMemoryHandleDesc = {}; if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32; } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueWin32Kmt; } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { externalMemoryHandleDesc.type = cudaExternalMemoryHandleTypeOpaqueFd; } else { throw std::runtime_error("Unknown handle type requested!"); } externalMemoryHandleDesc.size = size; #ifdef _WIN64 externalMemoryHandleDesc.handle.win32.handle = (HANDLE)getMemHandle(vkMem, handleType); #else externalMemoryHandleDesc.handle.fd = (int)(uintptr_t)getMemHandle(vkMem, handleType); #endif checkCudaErrors( cudaImportExternalMemory(&cudaMem, &externalMemoryHandleDesc)); cudaExternalMemoryBufferDesc externalMemBufferDesc = {}; externalMemBufferDesc.offset = 0; externalMemBufferDesc.size = size; externalMemBufferDesc.flags = 0; checkCudaErrors(cudaExternalMemoryGetMappedBuffer(cudaPtr, cudaMem, &externalMemBufferDesc)); } void importCudaExternalSemaphore( cudaExternalSemaphore_t &cudaSem, VkSemaphore &vkSem, VkExternalSemaphoreHandleTypeFlagBits handleType) { cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {}; #ifdef _VK_TIMELINE_SEMAPHORE if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32; } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32; } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd; } #else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32; } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; } #endif /* _VK_TIMELINE_SEMAPHORE */ else { throw std::runtime_error("Unknown handle type requested!"); } #ifdef _WIN64 externalSemaphoreHandleDesc.handle.win32.handle = (HANDLE)getSemaphoreHandle(vkSem, handleType); #else externalSemaphoreHandleDesc.handle.fd = (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType); #endif externalSemaphoreHandleDesc.flags = 0; checkCudaErrors( cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc)); } VkDeviceSize getUniformSize() const { return sizeof(UniformBufferObject); } void updateUniformBuffer(uint32_t imageIndex) { { mat4x4 view, proj; vec3 eye = {1.75f, 1.75f, 1.25f}; vec3 center = {0.0f, 0.0f, -0.25f}; vec3 up = {0.0f, 0.0f, 1.0f}; mat4x4_perspective( proj, (float)degreesToRadians(45.0f), m_swapChainExtent.width / (float)m_swapChainExtent.height, 0.1f, 10.0f); proj[1][1] *= -1.0f; // Flip y axis mat4x4_look_at(view, eye, center, up); mat4x4_mul(m_ubo.modelViewProj, proj, view); } void *data; vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, &data); memcpy(data, &m_ubo, sizeof(m_ubo)); vkUnmapMemory(m_device, m_uniformMemory[imageIndex]); } std::vector getRequiredExtensions() const { std::vector extensions; extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME); extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME); return extensions; } std::vector getRequiredDeviceExtensions() const { std::vector extensions; extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME); extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME); extensions.push_back(VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME); #ifdef _WIN64 extensions.push_back(VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME); extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME); #else extensions.push_back(VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME); extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME); #endif /* _WIN64 */ return extensions; } void drawFrame() { static chrono_tp startTime = std::chrono::high_resolution_clock::now(); chrono_tp currentTime = std::chrono::high_resolution_clock::now(); float time = std::chrono::duration( currentTime - startTime) .count(); if (m_currentFrame == 0) { m_lastTime = startTime; } float frame_time = std::chrono::duration(currentTime - m_lastTime) .count(); // Have vulkan draw the current frame... VulkanBaseApp::drawFrame(); #ifdef _VK_TIMELINE_SEMAPHORE cudaExternalSemaphoreWaitParams waitParams = {}; waitParams.flags = 0; waitParams.params.fence.value = 1; cudaExternalSemaphoreSignalParams signalParams = {}; signalParams.flags = 0; signalParams.params.fence.value = 0; // Wait for vulkan to complete it's work checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaTimelineSemaphore, &waitParams, 1, m_stream)); // Now step the simulation m_sim.stepSimulation(time, m_stream); // Signal vulkan to continue with the updated buffers checkCudaErrors(cudaSignalExternalSemaphoresAsync( &m_cudaTimelineSemaphore, &signalParams, 1, m_stream)); #else cudaExternalSemaphoreWaitParams waitParams = {}; waitParams.flags = 0; waitParams.params.fence.value = 0; cudaExternalSemaphoreSignalParams signalParams = {}; signalParams.flags = 0; signalParams.params.fence.value = 0; // Wait for vulkan to complete it's work checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore, &waitParams, 1, m_stream)); // Now step the simulation m_sim.stepSimulation(time, m_stream); // Signal vulkan to continue with the updated buffers checkCudaErrors(cudaSignalExternalSemaphoresAsync( &m_cudaSignalSemaphore, &signalParams, 1, m_stream)); #endif /* _VK_TIMELINE_SEMAPHORE */ // Output a naive measurement of the frames per second every five seconds if (frame_time > 5) { std::cout << "Average FPS (over " << std::fixed << std::setprecision(2) << frame_time << " seconds): " << std::fixed << std::setprecision(2) << ((m_currentFrame - m_lastFrame) / frame_time) << std::endl; m_lastFrame = m_currentFrame; m_lastTime = currentTime; } } }; int main(int argc, char **argv) { execution_path = argv[0]; VulkanCudaSineWave app((1ULL << 8ULL), (1ULL << 8ULL)); app.init(); app.mainLoop(); return 0; }