update vulkan samples with SPIR-V shaders

This commit is contained in:
Rutwik Choughule 2021-06-02 17:17:21 +05:30
parent 5c3ec60fae
commit 7a5b3e6c8c
17 changed files with 2368 additions and 2116 deletions

View File

@ -19,8 +19,17 @@ For Linux:
-- Install "libxcb1-dev" and "xorg-dev" as GLFW3 is depended on it -- Install "libxcb1-dev" and "xorg-dev" as GLFW3 is depended on it
-- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
For Linux aarch64(L4T): For Linux aarch64(L4T):
-- Install GLFW3 library using "sudo apt-get install libglfw3-dev" this will provide glfw3 -- Install GLFW3 library using "sudo apt-get install libglfw3-dev" this will provide glfw3
-- install above will also provide libvulkan-dev as dependencies -- install above will also provide libvulkan-dev as dependencies
-- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
-- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr" -- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr"
For Shader changes:
-- Update the sinewave.vert and/or sinewave.frag shader source file as required
-- Use the glslc shader compiler from the installed Vulkan SDK's bin directory to compile shaders as:
glslc sinewave.vert -o vert.spv
glslc sinewave.frag -o frag.spv
** Make sure to add the directory containing glslc to your PATH environment variable **

Binary file not shown.

View File

@ -92,9 +92,9 @@ class VulkanCudaSineWave : public VulkanBaseApp {
} }
// Add our compiled vulkan shader files // Add our compiled vulkan shader files
char *vertex_shader_path = char *vertex_shader_path =
sdkFindFilePath("sinewave.vert", execution_path.c_str()); sdkFindFilePath("vert.spv", execution_path.c_str());
char *fragment_shader_path = char *fragment_shader_path =
sdkFindFilePath("sinewave.frag", execution_path.c_str()); sdkFindFilePath("frag.spv", execution_path.c_str());
m_shaderFiles.push_back( m_shaderFiles.push_back(
std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path)); std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path));
m_shaderFiles.push_back( m_shaderFiles.push_back(

Binary file not shown.

View File

@ -0,0 +1,35 @@
For Windows:
Follow these steps once you have installed Vulkan SDK for Windows from https://www.lunarg.com/vulkan-sdk/
-- Install GLFW3 library at suitable location
-- Open the simpleVulkan VS project file.
To add the GLFW3 library path
-- Right click on Project name "simpleVulkan" click on "Properties"
-- In the Property Pages window go to Linker -> General. In "Additional Library Directories", add the path to the directory containing glfw3dll.lib
To add the GLFW3 headers path
-- Right click on Project name "simpleVulkan" click on "Properties"
-- In the Property Pages window go to the "VC++ Directories" section. In "Include Directories", add the path to the GLFW3 include (headers) directory.
** Make sure to add the directory containing glfw3.dll to your PATH environment variable **
For Linux:
-- Install the Vulkan SDK from https://www.lunarg.com/vulkan-sdk/ and follow environment setup instructions.
-- Install GLFW3 library through your OS package repository. For example: apt-get for Ubuntu and dnf for RHEL/CentOS. Below is for Ubuntu:
sudo apt-get install libglfw3
sudo apt-get install libglfw3-dev
-- Install "libxcb1-dev" and "xorg-dev", as GLFW3 depends on them
-- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
For Linux aarch64(L4T):
-- Install the GLFW3 library using "sudo apt-get install libglfw3-dev"; this provides glfw3
-- The install above also pulls in libvulkan-dev as a dependency
-- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
-- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr"
For Shader changes:
-- Update the montecarlo.vert and/or montecarlo.frag shader source file as required
-- Use the glslc shader compiler from the installed Vulkan SDK's bin directory to compile shaders as:
glslc montecarlo.vert -o vert.spv
glslc montecarlo.frag -o frag.spv
** Make sure to add the directory containing glslc to your PATH environment variable **

View File

@ -25,7 +25,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
/* /*
* See: https://www.piday.org/million/ * See: https://www.piday.org/million/
*/ */
@ -37,15 +37,16 @@
#define ROUND_UP_TO_GRANULARITY(x, n) (((x + n - 1) / n) * n) #define ROUND_UP_TO_GRANULARITY(x, n) (((x + n - 1) / n) * n)
// `ipcHandleTypeFlag` specifies the platform specific handle type this sample // `ipcHandleTypeFlag` specifies the platform specific handle type this sample
// uses for importing and exporting memory allocation. On Linux this sample // uses for importing and exporting memory allocation. On Linux this sample
// specifies the type as CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR meaning that // specifies the type as CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR meaning that
// file descriptors will be used. On Windows this sample specifies the type as // file descriptors will be used. On Windows this sample specifies the type as
// CU_MEM_HANDLE_TYPE_WIN32 meaning that NT HANDLEs will be used. The // CU_MEM_HANDLE_TYPE_WIN32 meaning that NT HANDLEs will be used. The
// ipcHandleTypeFlag variable is a convenience variable and is passed by value // ipcHandleTypeFlag variable is a convenience variable and is passed by value
// to individual requests. // to individual requests.
#if defined(__linux__) #if defined(__linux__)
CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; CUmemAllocationHandleType ipcHandleTypeFlag =
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
#else #else
CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_WIN32; CUmemAllocationHandleType ipcHandleTypeFlag = CU_MEM_HANDLE_TYPE_WIN32;
#endif #endif
@ -78,8 +79,9 @@ void getDefaultSecurityDescriptor(CUmemAllocationProp *prop) {
#endif #endif
} }
__global__ void monte_carlo_kernel(vec2 *xyVector, float *pointsInsideCircle, float *numPointsInCircle, unsigned int numPoints, float time) __global__ void monte_carlo_kernel(vec2 *xyVector, float *pointsInsideCircle,
{ float *numPointsInCircle,
unsigned int numPoints, float time) {
const size_t stride = gridDim.x * blockDim.x; const size_t stride = gridDim.x * blockDim.x;
size_t tid = blockIdx.x * blockDim.x + threadIdx.x; size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
float count = 0.0f; float count = 0.0f;
@ -96,26 +98,24 @@ __global__ void monte_carlo_kernel(vec2 *xyVector, float *pointsInsideCircle, fl
xyVector[tid][1] = y; xyVector[tid][1] = y;
// Compute the distance of this point form the center(0, 0) // Compute the distance of this point form the center(0, 0)
float dist = sqrtf((x*x) + (y*y)); float dist = sqrtf((x * x) + (y * y));
// If distance is less than the radius of the unit circle, the point lies in the circle. // If distance is less than the radius of the unit circle, the point lies in
// the circle.
pointsInsideCircle[tid] = (dist <= 1.0f); pointsInsideCircle[tid] = (dist <= 1.0f);
count += (dist <= 1.0f); count += (dist <= 1.0f);
} }
atomicAdd(numPointsInCircle, count); atomicAdd(numPointsInCircle, count);
} }
MonteCarloPiSimulation::MonteCarloPiSimulation(size_t num_points) : MonteCarloPiSimulation::MonteCarloPiSimulation(size_t num_points)
m_xyVector(nullptr), : m_xyVector(nullptr),
m_pointsInsideCircle(nullptr), m_pointsInsideCircle(nullptr),
m_totalPointsInsideCircle(0), m_totalPointsInsideCircle(0),
m_totalPointsSimulated(0), m_totalPointsSimulated(0),
m_numPoints(num_points) m_numPoints(num_points) {}
{
}
MonteCarloPiSimulation::~MonteCarloPiSimulation() MonteCarloPiSimulation::~MonteCarloPiSimulation() {
{
if (m_numPointsInCircle) { if (m_numPointsInCircle) {
checkCudaErrors(cudaFree(m_numPointsInCircle)); checkCudaErrors(cudaFree(m_numPointsInCircle));
m_numPointsInCircle = nullptr; m_numPointsInCircle = nullptr;
@ -128,70 +128,82 @@ MonteCarloPiSimulation::~MonteCarloPiSimulation()
cleanupSimulationAllocations(); cleanupSimulationAllocations();
} }
void MonteCarloPiSimulation::initSimulation(int cudaDevice, cudaStream_t stream) void MonteCarloPiSimulation::initSimulation(int cudaDevice,
{ cudaStream_t stream) {
m_cudaDevice = cudaDevice; m_cudaDevice = cudaDevice;
getIdealExecutionConfiguration(); getIdealExecutionConfiguration();
// Allocate a position buffer that contains random location of the points in XY cartesian plane. // Allocate a position buffer that contains random location of the points in
// Allocate a bitmap buffer which holds information of whether a point in the position buffer is inside the unit circle or not. // XY cartesian plane.
// Allocate a bitmap buffer which holds information of whether a point in the
// position buffer is inside the unit circle or not.
setupSimulationAllocations(); setupSimulationAllocations();
checkCudaErrors(cudaMalloc((float **)&m_numPointsInCircle, sizeof(*m_numPointsInCircle))); checkCudaErrors(
checkCudaErrors(cudaMallocHost((float **)&m_hostNumPointsInCircle, sizeof(*m_hostNumPointsInCircle))); cudaMalloc((float **)&m_numPointsInCircle, sizeof(*m_numPointsInCircle)));
checkCudaErrors(cudaMallocHost((float **)&m_hostNumPointsInCircle,
sizeof(*m_hostNumPointsInCircle)));
} }
void MonteCarloPiSimulation::stepSimulation(float time, cudaStream_t stream) void MonteCarloPiSimulation::stepSimulation(float time, cudaStream_t stream) {
{ checkCudaErrors(cudaMemsetAsync(m_numPointsInCircle, 0,
sizeof(*m_numPointsInCircle), stream));
checkCudaErrors(cudaMemsetAsync(m_numPointsInCircle, 0, sizeof(*m_numPointsInCircle), stream)); monte_carlo_kernel<<<m_blocks, m_threads, 0, stream>>>(
m_xyVector, m_pointsInsideCircle, m_numPointsInCircle, m_numPoints, time);
monte_carlo_kernel << < m_blocks, m_threads, 0, stream >> > (m_xyVector, m_pointsInsideCircle, m_numPointsInCircle, m_numPoints, time);
getLastCudaError("Failed to launch CUDA simulation"); getLastCudaError("Failed to launch CUDA simulation");
checkCudaErrors(cudaMemcpyAsync(m_hostNumPointsInCircle, m_numPointsInCircle, sizeof(*m_numPointsInCircle), cudaMemcpyDeviceToHost, stream)); checkCudaErrors(cudaMemcpyAsync(m_hostNumPointsInCircle, m_numPointsInCircle,
sizeof(*m_numPointsInCircle),
cudaMemcpyDeviceToHost, stream));
// Queue up a stream callback to compute and print the PI value. // Queue up a stream callback to compute and print the PI value.
checkCudaErrors(cudaLaunchHostFunc(stream, this->computePiCallback, (void *)this)); checkCudaErrors(
cudaLaunchHostFunc(stream, this->computePiCallback, (void *)this));
} }
void MonteCarloPiSimulation::computePiCallback(void *args) void MonteCarloPiSimulation::computePiCallback(void *args) {
{
MonteCarloPiSimulation *cbData = (MonteCarloPiSimulation *)args; MonteCarloPiSimulation *cbData = (MonteCarloPiSimulation *)args;
cbData->m_totalPointsInsideCircle += *(cbData->m_hostNumPointsInCircle); cbData->m_totalPointsInsideCircle += *(cbData->m_hostNumPointsInCircle);
cbData->m_totalPointsSimulated += cbData->m_numPoints; cbData->m_totalPointsSimulated += cbData->m_numPoints;
double piValue = 4.0 * ((double)cbData->m_totalPointsInsideCircle / (double)cbData->m_totalPointsSimulated); double piValue = 4.0 * ((double)cbData->m_totalPointsInsideCircle /
printf("Approximate Pi value for %zd data points: %lf \n", cbData->m_totalPointsSimulated, piValue); (double)cbData->m_totalPointsSimulated);
printf("Approximate Pi value for %zd data points: %lf \n",
cbData->m_totalPointsSimulated, piValue);
} }
void MonteCarloPiSimulation::getIdealExecutionConfiguration() void MonteCarloPiSimulation::getIdealExecutionConfiguration() {
{
int warpSize = 0; int warpSize = 0;
int multiProcessorCount = 0; int multiProcessorCount = 0;
checkCudaErrors(cudaSetDevice(m_cudaDevice)); checkCudaErrors(cudaSetDevice(m_cudaDevice));
checkCudaErrors(cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, m_cudaDevice)); checkCudaErrors(
cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, m_cudaDevice));
// We don't need large block sizes, since there's not much inter-thread communication // We don't need large block sizes, since there's not much inter-thread
// communication
m_threads = warpSize; m_threads = warpSize;
// Use the occupancy calculator and fill the gpu as best as we can // Use the occupancy calculator and fill the gpu as best as we can
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&m_blocks, monte_carlo_kernel, warpSize, 0)); checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&m_blocks, monte_carlo_kernel, warpSize, 0));
checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, m_cudaDevice)); checkCudaErrors(cudaDeviceGetAttribute(
&multiProcessorCount, cudaDevAttrMultiProcessorCount, m_cudaDevice));
m_blocks *= multiProcessorCount; m_blocks *= multiProcessorCount;
// Go ahead and the clamp the blocks to the minimum needed for this height/width // Go ahead and the clamp the blocks to the minimum needed for this
m_blocks = std::min(m_blocks, (int)((m_numPoints + m_threads - 1) / m_threads)); // height/width
m_blocks =
std::min(m_blocks, (int)((m_numPoints + m_threads - 1) / m_threads));
} }
void MonteCarloPiSimulation::setupSimulationAllocations() void MonteCarloPiSimulation::setupSimulationAllocations() {
{
CUdeviceptr d_ptr = 0U; CUdeviceptr d_ptr = 0U;
size_t granularity = 0; size_t granularity = 0;
CUmemGenericAllocationHandle cudaPositionHandle, cudaInCircleHandle; CUmemGenericAllocationHandle cudaPositionHandle, cudaInCircleHandle;
CUmemAllocationProp allocProp = { }; CUmemAllocationProp allocProp = {};
allocProp.type = CU_MEM_ALLOCATION_TYPE_PINNED; allocProp.type = CU_MEM_ALLOCATION_TYPE_PINNED;
allocProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE; allocProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
allocProp.location.id = m_cudaDevice; allocProp.location.id = m_cudaDevice;
@ -205,30 +217,39 @@ void MonteCarloPiSimulation::setupSimulationAllocations()
getDefaultSecurityDescriptor(&allocProp); getDefaultSecurityDescriptor(&allocProp);
// Get the recommended granularity for m_cudaDevice. // Get the recommended granularity for m_cudaDevice.
checkCudaErrors(cuMemGetAllocationGranularity(&granularity, &allocProp, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); checkCudaErrors(cuMemGetAllocationGranularity(
&granularity, &allocProp, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
size_t xyPositionVecSize = m_numPoints * sizeof(*m_xyVector); size_t xyPositionVecSize = m_numPoints * sizeof(*m_xyVector);
size_t inCircleVecSize = m_numPoints * sizeof(*m_pointsInsideCircle); size_t inCircleVecSize = m_numPoints * sizeof(*m_pointsInsideCircle);
size_t xyPositionSize = ROUND_UP_TO_GRANULARITY(xyPositionVecSize, granularity); size_t xyPositionSize =
ROUND_UP_TO_GRANULARITY(xyPositionVecSize, granularity);
size_t inCircleSize = ROUND_UP_TO_GRANULARITY(inCircleVecSize, granularity); size_t inCircleSize = ROUND_UP_TO_GRANULARITY(inCircleVecSize, granularity);
m_totalAllocationSize = (xyPositionSize + inCircleSize); m_totalAllocationSize = (xyPositionSize + inCircleSize);
// Reserve the required contiguous VA space for the allocations // Reserve the required contiguous VA space for the allocations
checkCudaErrors(cuMemAddressReserve(&d_ptr, m_totalAllocationSize, granularity, 0U, 0)); checkCudaErrors(
cuMemAddressReserve(&d_ptr, m_totalAllocationSize, granularity, 0U, 0));
// Create the allocations as a pinned allocation on this device. // Create the allocations as a pinned allocation on this device.
// Create an allocation to store all the positions of points on the xy plane and a second // Create an allocation to store all the positions of points on the xy plane
// allocation which stores information if the corresponding position is inside the unit circle or not. // and a second allocation which stores information if the corresponding
checkCudaErrors(cuMemCreate(&cudaPositionHandle, xyPositionSize, &allocProp, 0)); // position is inside the unit circle or not.
checkCudaErrors(cuMemCreate(&cudaInCircleHandle, inCircleSize, &allocProp, 0)); checkCudaErrors(
cuMemCreate(&cudaPositionHandle, xyPositionSize, &allocProp, 0));
checkCudaErrors(
cuMemCreate(&cudaInCircleHandle, inCircleSize, &allocProp, 0));
// Export the allocation to a platform-specific handle. The type of handle // Export the allocation to a platform-specific handle. The type of handle
// requested here must match the requestedHandleTypes field in the prop // requested here must match the requestedHandleTypes field in the prop
// structure passed to cuMemCreate. The handle obtained here will be passed to vulkan // structure passed to cuMemCreate. The handle obtained here will be passed to
// to import the allocation. // vulkan to import the allocation.
checkCudaErrors(cuMemExportToShareableHandle((void *)&m_posShareableHandle, cudaPositionHandle, ipcHandleTypeFlag, 0)); checkCudaErrors(cuMemExportToShareableHandle(
checkCudaErrors(cuMemExportToShareableHandle((void *)&m_inCircleShareableHandle, cudaInCircleHandle, ipcHandleTypeFlag, 0)); (void *)&m_posShareableHandle, cudaPositionHandle, ipcHandleTypeFlag, 0));
checkCudaErrors(
cuMemExportToShareableHandle((void *)&m_inCircleShareableHandle,
cudaInCircleHandle, ipcHandleTypeFlag, 0));
CUdeviceptr va_position = d_ptr; CUdeviceptr va_position = d_ptr;
CUdeviceptr va_InCircle = va_position + xyPositionSize; CUdeviceptr va_InCircle = va_position + xyPositionSize;
@ -236,12 +257,15 @@ void MonteCarloPiSimulation::setupSimulationAllocations()
m_xyVector = (vec2 *)va_position; m_xyVector = (vec2 *)va_position;
// Assign the chunk to the appropriate VA range // Assign the chunk to the appropriate VA range
checkCudaErrors(cuMemMap(va_position, xyPositionSize, 0, cudaPositionHandle, 0)); checkCudaErrors(
checkCudaErrors(cuMemMap(va_InCircle, inCircleSize, 0, cudaInCircleHandle, 0)); cuMemMap(va_position, xyPositionSize, 0, cudaPositionHandle, 0));
checkCudaErrors(
cuMemMap(va_InCircle, inCircleSize, 0, cudaInCircleHandle, 0));
// Release the handles for the allocation. Since the allocation is currently mapped to a VA range // Release the handles for the allocation. Since the allocation is currently
// with a previous call to cuMemMap the actual freeing of memory allocation will happen on an eventual call to // mapped to a VA range with a previous call to cuMemMap the actual freeing of
// cuMemUnmap. Thus the allocation will be kept live until it is unmapped. // memory allocation will happen on an eventual call to cuMemUnmap. Thus the
// allocation will be kept live until it is unmapped.
checkCudaErrors(cuMemRelease(cudaPositionHandle)); checkCudaErrors(cuMemRelease(cudaPositionHandle));
checkCudaErrors(cuMemRelease(cudaInCircleHandle)); checkCudaErrors(cuMemRelease(cudaInCircleHandle));
@ -250,12 +274,13 @@ void MonteCarloPiSimulation::setupSimulationAllocations()
accessDescriptor.location.type = CU_MEM_LOCATION_TYPE_DEVICE; accessDescriptor.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; accessDescriptor.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
// Apply the access descriptor to the whole VA range. Essentially enables Read-Write access to the range. // Apply the access descriptor to the whole VA range. Essentially enables
checkCudaErrors(cuMemSetAccess(d_ptr, m_totalAllocationSize, &accessDescriptor, 1)); // Read-Write access to the range.
checkCudaErrors(
cuMemSetAccess(d_ptr, m_totalAllocationSize, &accessDescriptor, 1));
} }
void MonteCarloPiSimulation::cleanupSimulationAllocations() void MonteCarloPiSimulation::cleanupSimulationAllocations() {
{
if (m_xyVector && m_pointsInsideCircle) { if (m_xyVector && m_pointsInsideCircle) {
// Unmap the mapped virtual memory region // Unmap the mapped virtual memory region
// Since the handles to the mapped backing stores have already been released // Since the handles to the mapped backing stores have already been released
@ -267,7 +292,8 @@ void MonteCarloPiSimulation::cleanupSimulationAllocations()
checkIpcErrors(ipcCloseShareableHandle(m_inCircleShareableHandle)); checkIpcErrors(ipcCloseShareableHandle(m_inCircleShareableHandle));
// Free the virtual address region. // Free the virtual address region.
checkCudaErrors(cuMemAddressFree((CUdeviceptr)m_xyVector, m_totalAllocationSize)); checkCudaErrors(
cuMemAddressFree((CUdeviceptr)m_xyVector, m_totalAllocationSize));
m_xyVector = nullptr; m_xyVector = nullptr;
m_pointsInsideCircle = nullptr; m_pointsInsideCircle = nullptr;

View File

@ -39,33 +39,35 @@
typedef float vec2[2]; typedef float vec2[2];
class MonteCarloPiSimulation class MonteCarloPiSimulation {
{
size_t m_numPoints; size_t m_numPoints;
// Pointers to Cuda allocated buffers which are imported and used by vulkan as vertex buffer // Pointers to Cuda allocated buffers which are imported and used by vulkan as
// vertex buffer
vec2 *m_xyVector; vec2 *m_xyVector;
float *m_pointsInsideCircle; float *m_pointsInsideCircle;
// Pointers to device and host allocated memories storing number of points that are inside the unit circle // Pointers to device and host allocated memories storing number of points
// that are inside the unit circle
float *m_numPointsInCircle; float *m_numPointsInCircle;
float *m_hostNumPointsInCircle; float *m_hostNumPointsInCircle;
int m_blocks, m_threads; int m_blocks, m_threads;
// Total size of allocations created by cuMemMap Apis. This size is the sum of sizes of // Total size of allocations created by cuMemMap Apis. This size is the sum of
// m_xyVector and m_pointsInsideCircle buffers. // sizes of m_xyVector and m_pointsInsideCircle buffers.
size_t m_totalAllocationSize; size_t m_totalAllocationSize;
// Shareable Handles(a file descriptor on Linux and NT Handle on Windows), used for sharing cuda // Shareable Handles(a file descriptor on Linux and NT Handle on Windows),
// used for sharing cuda
// allocated memory with Vulkan // allocated memory with Vulkan
ShareableHandle m_posShareableHandle, m_inCircleShareableHandle; ShareableHandle m_posShareableHandle, m_inCircleShareableHandle;
// Cuda Device corresponding to the Vulkan Physical device // Cuda Device corresponding to the Vulkan Physical device
int m_cudaDevice; int m_cudaDevice;
// Track and accumulate total points that have been simulated since start of the sample. // Track and accumulate total points that have been simulated since start of
// The idea is to get a closer approximation to PI with time. // the sample. The idea is to get a closer approximation to PI with time.
size_t m_totalPointsInsideCircle; size_t m_totalPointsInsideCircle;
size_t m_totalPointsSimulated; size_t m_totalPointsSimulated;
@ -73,28 +75,21 @@ class MonteCarloPiSimulation
void cleanupSimulationAllocations(); void cleanupSimulationAllocations();
void getIdealExecutionConfiguration(); void getIdealExecutionConfiguration();
public: public:
MonteCarloPiSimulation(size_t num_points); MonteCarloPiSimulation(size_t num_points);
~MonteCarloPiSimulation(); ~MonteCarloPiSimulation();
void initSimulation(int cudaDevice, cudaStream_t stream = 0); void initSimulation(int cudaDevice, cudaStream_t stream = 0);
void stepSimulation(float time, cudaStream_t stream = 0); void stepSimulation(float time, cudaStream_t stream = 0);
static void computePiCallback(void *args); static void computePiCallback(void *args);
size_t getNumPoints() const { size_t getNumPoints() const { return m_numPoints; }
return m_numPoints;
}
float getNumPointsInCircle() const { float getNumPointsInCircle() const { return *m_hostNumPointsInCircle; }
return *m_hostNumPointsInCircle;
}
ShareableHandle &getPositionShareableHandle() { ShareableHandle &getPositionShareableHandle() { return m_posShareableHandle; }
return m_posShareableHandle;
}
ShareableHandle &getInCircleShareableHandle() { ShareableHandle &getInCircleShareableHandle() {
return m_inCircleShareableHandle; return m_inCircleShareableHandle;
} }
}; };
#endif // __PISIM_H__ #endif // __PISIM_H__

File diff suppressed because it is too large Load Diff

View File

@ -40,26 +40,38 @@
struct GLFWwindow; struct GLFWwindow;
class VulkanBaseApp class VulkanBaseApp {
{ public:
public:
VulkanBaseApp(const std::string& appName, bool enableValidation = false); VulkanBaseApp(const std::string& appName, bool enableValidation = false);
static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType(); static VkExternalSemaphoreHandleTypeFlagBits getDefaultSemaphoreHandleType();
static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType(); static VkExternalMemoryHandleTypeFlagBits getDefaultMemHandleType();
virtual ~VulkanBaseApp(); virtual ~VulkanBaseApp();
void init(); void init();
void *getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType); void* getMemHandle(VkDeviceMemory memory,
void *getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); VkExternalMemoryHandleTypeFlagBits handleType);
bool isVkPhysicalDeviceUuid(void *Uuid); void* getSemaphoreHandle(VkSemaphore semaphore,
void createExternalSemaphore(VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); VkExternalSemaphoreHandleTypeFlagBits handleType);
void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& bufferMemory); bool isVkPhysicalDeviceUuid(void* Uuid);
void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkExternalMemoryHandleTypeFlagsKHR extMemHandleType, VkBuffer& buffer, VkDeviceMemory& bufferMemory); void createExternalSemaphore(
void importExternalBuffer(void *handle, VkExternalMemoryHandleTypeFlagBits handleType, size_t size, VkBufferUsageFlags usage, VkMemoryPropertyFlags properties, VkBuffer& buffer, VkDeviceMemory& memory); VkSemaphore& semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType);
void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage,
VkMemoryPropertyFlags properties, VkBuffer& buffer,
VkDeviceMemory& bufferMemory);
void createExternalBuffer(VkDeviceSize size, VkBufferUsageFlags usage,
VkMemoryPropertyFlags properties,
VkExternalMemoryHandleTypeFlagsKHR extMemHandleType,
VkBuffer& buffer, VkDeviceMemory& bufferMemory);
void importExternalBuffer(void* handle,
VkExternalMemoryHandleTypeFlagBits handleType,
size_t size, VkBufferUsageFlags usage,
VkMemoryPropertyFlags properties, VkBuffer& buffer,
VkDeviceMemory& memory);
void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size); void copyBuffer(VkBuffer dst, VkBuffer src, VkDeviceSize size);
VkCommandBuffer beginSingleTimeCommands(); VkCommandBuffer beginSingleTimeCommands();
void endSingleTimeCommands(VkCommandBuffer commandBuffer); void endSingleTimeCommands(VkCommandBuffer commandBuffer);
void mainLoop(); void mainLoop();
protected:
protected:
const std::string m_appName; const std::string m_appName;
const bool m_enableValidation; const bool m_enableValidation;
VkInstance m_instance; VkInstance m_instance;
@ -99,17 +111,23 @@ protected:
virtual void initVulkanApp() {} virtual void initVulkanApp() {}
virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {} virtual void fillRenderingCommandBuffer(VkCommandBuffer& buffer) {}
virtual std::vector<const char *> getRequiredExtensions() const; virtual std::vector<const char*> getRequiredExtensions() const;
virtual std::vector<const char *> getRequiredDeviceExtensions() const; virtual std::vector<const char*> getRequiredDeviceExtensions() const;
virtual void getVertexDescriptions(std::vector<VkVertexInputBindingDescription>& bindingDesc, std::vector<VkVertexInputAttributeDescription>& attribDesc); virtual void getVertexDescriptions(
virtual void getAssemblyStateInfo(VkPipelineInputAssemblyStateCreateInfo& info); std::vector<VkVertexInputBindingDescription>& bindingDesc,
virtual void getWaitFrameSemaphores(std::vector<VkSemaphore>& wait, std::vector< VkPipelineStageFlags>& waitStages) const; std::vector<VkVertexInputAttributeDescription>& attribDesc);
virtual void getAssemblyStateInfo(
VkPipelineInputAssemblyStateCreateInfo& info);
virtual void getWaitFrameSemaphores(
std::vector<VkSemaphore>& wait,
std::vector<VkPipelineStageFlags>& waitStages) const;
virtual void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const; virtual void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const;
virtual VkDeviceSize getUniformSize() const; virtual VkDeviceSize getUniformSize() const;
virtual void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame); virtual void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame);
virtual void drawFrame(); virtual void drawFrame();
private:
GLFWwindow *m_window; private:
GLFWwindow* m_window;
void initWindow(); void initWindow();
void initVulkan(); void initVulkan();
@ -134,7 +152,7 @@ private:
void recreateSwapChain(); void recreateSwapChain();
bool isSuitableDevice(VkPhysicalDevice dev) const; bool isSuitableDevice(VkPhysicalDevice dev) const;
static void resizeCallback(GLFWwindow *window, int width, int height); static void resizeCallback(GLFWwindow* window, int width, int height);
}; };
void readFile(std::istream& s, std::vector<char>& data); void readFile(std::istream& s, std::vector<char>& data);

View File

@ -35,13 +35,12 @@
#include <helper_cuda.h> #include <helper_cuda.h>
bool isDeviceCompatible(void *Uuid, size_t size) { bool isDeviceCompatible(void *Uuid, size_t size) {
int cudaDevice = cudaInvalidDeviceId; int cudaDevice = cudaInvalidDeviceId;
int deviceCount; int deviceCount;
checkCudaErrors(cudaGetDeviceCount(&deviceCount)); checkCudaErrors(cudaGetDeviceCount(&deviceCount));
for (int i = 0; i < deviceCount; ++i) { for (int i = 0; i < deviceCount; ++i) {
cudaDeviceProp devProp = { }; cudaDeviceProp devProp = {};
checkCudaErrors(cudaGetDeviceProperties(&devProp, i)); checkCudaErrors(cudaGetDeviceProperties(&devProp, i));
if (!memcmp(&devProp.uuid, Uuid, size)) { if (!memcmp(&devProp.uuid, Uuid, size)) {
cudaDevice = i; cudaDevice = i;
@ -56,20 +55,28 @@ bool isDeviceCompatible(void *Uuid, size_t size) {
int attributeVal = 0; int attributeVal = 0;
int deviceComputeMode = 0; int deviceComputeMode = 0;
checkCudaErrors(cuDeviceGetAttribute(&deviceComputeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, cudaDevice)); checkCudaErrors(cuDeviceGetAttribute(
checkCudaErrors(cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cudaDevice)); &deviceComputeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, cudaDevice));
checkCudaErrors(cuDeviceGetAttribute(
&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
cudaDevice));
#if defined(__linux__) #if defined(__linux__)
checkCudaErrors(cuDeviceGetAttribute(&deviceSupportsHandle, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED, cudaDevice)); checkCudaErrors(cuDeviceGetAttribute(
&deviceSupportsHandle,
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED,
cudaDevice));
#else #else
checkCudaErrors(cuDeviceGetAttribute(&deviceSupportsHandle, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, cudaDevice)); checkCudaErrors(cuDeviceGetAttribute(
&deviceSupportsHandle,
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED, cudaDevice));
#endif #endif
if ((deviceComputeMode != CU_COMPUTEMODE_DEFAULT) || !attributeVal || !deviceSupportsHandle) { if ((deviceComputeMode != CU_COMPUTEMODE_DEFAULT) || !attributeVal ||
!deviceSupportsHandle) {
return false; return false;
} }
return true; return true;
} }
#endif // __VKCUDA_H__ #endif // __VKCUDA_H__

Binary file not shown.

View File

@ -25,10 +25,11 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
/* /*
* This sample demonstrates CUDA Interop with Vulkan using cuMemMap APIs. * This sample demonstrates CUDA Interop with Vulkan using cuMemMap APIs.
* Allocating device memory and updating values in those allocations are performed by CUDA * Allocating device memory and updating values in those allocations are
* and the contents of the allocation are visualized by Vulkan. * performed by CUDA and the contents of the allocation are visualized by
* Vulkan.
*/ */
#include "VulkanBaseApp.h" #include "VulkanBaseApp.h"
@ -55,11 +56,8 @@
std::string execution_path; std::string execution_path;
class VulkanCudaPi : public VulkanBaseApp class VulkanCudaPi : public VulkanBaseApp {
{ typedef struct UniformBufferObject_st { float frame; } UniformBufferObject;
typedef struct UniformBufferObject_st {
float frame;
} UniformBufferObject;
VkBuffer m_inCircleBuffer, m_xyPositionBuffer; VkBuffer m_inCircleBuffer, m_xyPositionBuffer;
VkDeviceMemory m_inCircleMemory, m_xyPositionMemory; VkDeviceMemory m_inCircleMemory, m_xyPositionMemory;
@ -71,9 +69,10 @@ class VulkanCudaPi : public VulkanBaseApp
using chrono_tp = std::chrono::time_point<std::chrono::high_resolution_clock>; using chrono_tp = std::chrono::time_point<std::chrono::high_resolution_clock>;
chrono_tp m_lastTime; chrono_tp m_lastTime;
size_t m_lastFrame; size_t m_lastFrame;
public:
VulkanCudaPi(size_t num_points) : public:
VulkanBaseApp("simpleVulkanMMAP", ENABLE_VALIDATION), VulkanCudaPi(size_t num_points)
: VulkanBaseApp("simpleVulkanMMAP", ENABLE_VALIDATION),
m_inCircleBuffer(VK_NULL_HANDLE), m_inCircleBuffer(VK_NULL_HANDLE),
m_xyPositionBuffer(VK_NULL_HANDLE), m_xyPositionBuffer(VK_NULL_HANDLE),
m_inCircleMemory(VK_NULL_HANDLE), m_inCircleMemory(VK_NULL_HANDLE),
@ -86,12 +85,15 @@ public:
m_cudaWaitSemaphore(), m_cudaWaitSemaphore(),
m_cudaSignalSemaphore(), m_cudaSignalSemaphore(),
m_lastFrame(0) { m_lastFrame(0) {
// Add our compiled vulkan shader files // Add our compiled vulkan shader files
char* vertex_shader_path = sdkFindFilePath("montecarlo.vert", execution_path.c_str()); char* vertex_shader_path =
char* fragment_shader_path = sdkFindFilePath("montecarlo.frag", execution_path.c_str()); sdkFindFilePath("vert.spv", execution_path.c_str());
m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path)); char* fragment_shader_path =
m_shaderFiles.push_back(std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path)); sdkFindFilePath("frag.spv", execution_path.c_str());
m_shaderFiles.push_back(
std::make_pair(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader_path));
m_shaderFiles.push_back(
std::make_pair(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader_path));
} }
~VulkanCudaPi() { ~VulkanCudaPi() {
@ -124,13 +126,17 @@ public:
} }
void fillRenderingCommandBuffer(VkCommandBuffer& commandBuffer) { void fillRenderingCommandBuffer(VkCommandBuffer& commandBuffer) {
VkBuffer vertexBuffers[] = { m_inCircleBuffer, m_xyPositionBuffer }; VkBuffer vertexBuffers[] = {m_inCircleBuffer, m_xyPositionBuffer};
VkDeviceSize offsets[] = { 0, 0 }; VkDeviceSize offsets[] = {0, 0};
vkCmdBindVertexBuffers(commandBuffer, 0, sizeof(vertexBuffers) / sizeof(vertexBuffers[0]), vertexBuffers, offsets); vkCmdBindVertexBuffers(commandBuffer, 0,
sizeof(vertexBuffers) / sizeof(vertexBuffers[0]),
vertexBuffers, offsets);
vkCmdDraw(commandBuffer, (uint32_t)(m_sim.getNumPoints()), 1, 0, 0); vkCmdDraw(commandBuffer, (uint32_t)(m_sim.getNumPoints()), 1, 0, 0);
} }
void getVertexDescriptions(std::vector<VkVertexInputBindingDescription>& bindingDesc, std::vector<VkVertexInputAttributeDescription>& attribDesc) { void getVertexDescriptions(
std::vector<VkVertexInputBindingDescription>& bindingDesc,
std::vector<VkVertexInputAttributeDescription>& attribDesc) {
bindingDesc.resize(2); bindingDesc.resize(2);
attribDesc.resize(2); attribDesc.resize(2);
@ -159,30 +165,37 @@ public:
info.primitiveRestartEnable = VK_FALSE; info.primitiveRestartEnable = VK_FALSE;
} }
void getWaitFrameSemaphores(std::vector<VkSemaphore>& wait, std::vector< VkPipelineStageFlags>& waitStages) const { void getWaitFrameSemaphores(
std::vector<VkSemaphore>& wait,
std::vector<VkPipelineStageFlags>& waitStages) const {
if (m_currentFrame != 0) { if (m_currentFrame != 0) {
// Have vulkan wait until cuda is done with the vertex buffer before rendering // Have vulkan wait until cuda is done with the vertex buffer before
// We don't do this on the first frame, as the wait semaphore hasn't been initialized yet // rendering
// We don't do this on the first frame, as the wait semaphore hasn't been
// initialized yet
wait.push_back(m_vkWaitSemaphore); wait.push_back(m_vkWaitSemaphore);
// We want to wait until all the pipeline commands are complete before letting cuda work // We want to wait until all the pipeline commands are complete before
// letting cuda work
waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); waitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
} }
} }
void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const { void getSignalFrameSemaphores(std::vector<VkSemaphore>& signal) const {
// Add this semaphore for vulkan to signal once the vertex buffer is ready for cuda to modify // Add this semaphore for vulkan to signal once the vertex buffer is ready
// for cuda to modify
signal.push_back(m_vkSignalSemaphore); signal.push_back(m_vkSignalSemaphore);
} }
void initVulkanApp() { void initVulkanApp() {
const size_t nVerts = m_sim.getNumPoints(); const size_t nVerts = m_sim.getNumPoints();
// Obtain cuda device id for the device corresponding to the Vulkan physical device // Obtain cuda device id for the device corresponding to the Vulkan physical
// device
int deviceCount; int deviceCount;
int cudaDevice = cudaInvalidDeviceId; int cudaDevice = cudaInvalidDeviceId;
checkCudaErrors(cudaGetDeviceCount(&deviceCount)); checkCudaErrors(cudaGetDeviceCount(&deviceCount));
for (int dev = 0; dev < deviceCount; ++dev) { for (int dev = 0; dev < deviceCount; ++dev) {
cudaDeviceProp devProp = { }; cudaDeviceProp devProp = {};
checkCudaErrors(cudaGetDeviceProperties(&devProp, dev)); checkCudaErrors(cudaGetDeviceProperties(&devProp, dev));
if (isVkPhysicalDeviceUuid(&devProp.uuid)) { if (isVkPhysicalDeviceUuid(&devProp.uuid)) {
cudaDevice = dev; cudaDevice = dev;
@ -195,75 +208,94 @@ public:
// On the corresponding cuda device, create the cuda stream we'll be using // On the corresponding cuda device, create the cuda stream we'll be using
checkCudaErrors(cudaSetDevice(cudaDevice)); checkCudaErrors(cudaSetDevice(cudaDevice));
checkCudaErrors(cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking)); checkCudaErrors(
cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking));
m_sim.initSimulation(cudaDevice, m_stream); m_sim.initSimulation(cudaDevice, m_stream);
importExternalBuffer((void *)(uintptr_t)m_sim.getPositionShareableHandle(), getDefaultMemHandleType(), nVerts * sizeof(vec2), importExternalBuffer(
(void*)(uintptr_t)m_sim.getPositionShareableHandle(),
getDefaultMemHandleType(), nVerts * sizeof(vec2),
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_xyPositionBuffer, m_xyPositionMemory); VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_xyPositionBuffer,
m_xyPositionMemory);
importExternalBuffer((void *)(uintptr_t)m_sim.getInCircleShareableHandle(), getDefaultMemHandleType(), nVerts * sizeof(float), importExternalBuffer(
(void*)(uintptr_t)m_sim.getInCircleShareableHandle(),
getDefaultMemHandleType(), nVerts * sizeof(float),
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_inCircleBuffer, m_inCircleMemory); VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, m_inCircleBuffer,
m_inCircleMemory);
// Create the semaphore vulkan will signal when it's done with the vertex buffer // Create the semaphore vulkan will signal when it's done with the vertex
createExternalSemaphore(m_vkSignalSemaphore, getDefaultSemaphoreHandleType()); // buffer
createExternalSemaphore(m_vkSignalSemaphore,
getDefaultSemaphoreHandleType());
// Create the semaphore vulkan will wait for before using the vertex buffer // Create the semaphore vulkan will wait for before using the vertex buffer
createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); createExternalSemaphore(m_vkWaitSemaphore, getDefaultSemaphoreHandleType());
// Import the semaphore cuda will use -- vulkan's signal will be cuda's wait // Import the semaphore cuda will use -- vulkan's signal will be cuda's wait
importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore, getDefaultSemaphoreHandleType()); importCudaExternalSemaphore(m_cudaWaitSemaphore, m_vkSignalSemaphore,
getDefaultSemaphoreHandleType());
// Import the semaphore cuda will use -- cuda's signal will be vulkan's wait // Import the semaphore cuda will use -- cuda's signal will be vulkan's wait
importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore, getDefaultSemaphoreHandleType()); importCudaExternalSemaphore(m_cudaSignalSemaphore, m_vkWaitSemaphore,
getDefaultSemaphoreHandleType());
} }
void importCudaExternalSemaphore(cudaExternalSemaphore_t& cudaSem, VkSemaphore& vkSem, VkExternalSemaphoreHandleTypeFlagBits handleType) { void importCudaExternalSemaphore(
cudaExternalSemaphore_t& cudaSem, VkSemaphore& vkSem,
VkExternalSemaphoreHandleTypeFlagBits handleType) {
cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {}; cudaExternalSemaphoreHandleDesc externalSemaphoreHandleDesc = {};
if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) { if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT) {
externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32; externalSemaphoreHandleDesc.type =
} cudaExternalSemaphoreHandleTypeOpaqueWin32;
else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) { } else if (handleType &
externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt; VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT_BIT) {
} externalSemaphoreHandleDesc.type =
else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) { cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt;
externalSemaphoreHandleDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd; } else if (handleType & VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT) {
} externalSemaphoreHandleDesc.type =
else { cudaExternalSemaphoreHandleTypeOpaqueFd;
} else {
throw std::runtime_error("Unknown handle type requested!"); throw std::runtime_error("Unknown handle type requested!");
} }
#ifdef _WIN64 #ifdef _WIN64
externalSemaphoreHandleDesc.handle.win32.handle = (HANDLE)getSemaphoreHandle(vkSem, handleType); externalSemaphoreHandleDesc.handle.win32.handle =
(HANDLE)getSemaphoreHandle(vkSem, handleType);
#else #else
externalSemaphoreHandleDesc.handle.fd = (int)(uintptr_t)getSemaphoreHandle(vkSem, handleType); externalSemaphoreHandleDesc.handle.fd =
(int)(uintptr_t)getSemaphoreHandle(vkSem, handleType);
#endif #endif
externalSemaphoreHandleDesc.flags = 0; externalSemaphoreHandleDesc.flags = 0;
checkCudaErrors(cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc)); checkCudaErrors(
cudaImportExternalSemaphore(&cudaSem, &externalSemaphoreHandleDesc));
} }
VkDeviceSize getUniformSize() const { VkDeviceSize getUniformSize() const { return sizeof(UniformBufferObject); }
return sizeof(UniformBufferObject);
}
void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame) { void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame) {
m_ubo.frame = (float)globalFrame; m_ubo.frame = (float)globalFrame;
void *data; void* data;
vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0, &data); vkMapMemory(m_device, m_uniformMemory[imageIndex], 0, getUniformSize(), 0,
&data);
memcpy(data, &m_ubo, sizeof(m_ubo)); memcpy(data, &m_ubo, sizeof(m_ubo));
vkUnmapMemory(m_device, m_uniformMemory[imageIndex]); vkUnmapMemory(m_device, m_uniformMemory[imageIndex]);
} }
std::vector<const char *> getRequiredExtensions() const { std::vector<const char*> getRequiredExtensions() const {
std::vector<const char *> extensions; std::vector<const char*> extensions;
extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME); extensions.push_back(VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME);
extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME); extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME);
extensions.push_back(VK_KHR_EXTERNAL_FENCE_CAPABILITIES_EXTENSION_NAME);
extensions.push_back(
VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME);
return extensions; return extensions;
} }
std::vector<const char *> getRequiredDeviceExtensions() const { std::vector<const char*> getRequiredDeviceExtensions() const {
std::vector<const char *> extensions; std::vector<const char*> extensions;
extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME); extensions.push_back(VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME);
extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME); extensions.push_back(VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME);
@ -281,7 +313,9 @@ public:
static chrono_tp startTime = std::chrono::high_resolution_clock::now(); static chrono_tp startTime = std::chrono::high_resolution_clock::now();
chrono_tp currentTime = std::chrono::high_resolution_clock::now(); chrono_tp currentTime = std::chrono::high_resolution_clock::now();
float time = std::chrono::duration<float, std::chrono::seconds::period>(currentTime - startTime).count(); float time = std::chrono::duration<float, std::chrono::seconds::period>(
currentTime - startTime)
.count();
if (m_currentFrame == 0) { if (m_currentFrame == 0) {
m_lastTime = startTime; m_lastTime = startTime;
@ -298,17 +332,18 @@ public:
// Have vulkan draw the current frame... // Have vulkan draw the current frame...
VulkanBaseApp::drawFrame(); VulkanBaseApp::drawFrame();
// Wait for vulkan to complete its work // Wait for vulkan to complete its work
checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore, &waitParams, 1, m_stream)); checkCudaErrors(cudaWaitExternalSemaphoresAsync(&m_cudaWaitSemaphore,
&waitParams, 1, m_stream));
// Now step the simulation // Now step the simulation
m_sim.stepSimulation(time, m_stream); m_sim.stepSimulation(time, m_stream);
// Signal vulkan to continue with the updated buffers // Signal vulkan to continue with the updated buffers
checkCudaErrors(cudaSignalExternalSemaphoresAsync(&m_cudaSignalSemaphore, &signalParams, 1, m_stream)); checkCudaErrors(cudaSignalExternalSemaphoresAsync(
&m_cudaSignalSemaphore, &signalParams, 1, m_stream));
} }
}; };
int main(int argc, char **argv) int main(int argc, char** argv) {
{
execution_path = argv[0]; execution_path = argv[0];
VulkanCudaPi app(NUM_SIMULATION_POINTS); VulkanCudaPi app(NUM_SIMULATION_POINTS);
app.init(); app.init();

Binary file not shown.

View File

@ -19,8 +19,17 @@ For Linux:
-- Install "libxcb1-dev" and "xorg-dev", as GLFW3 depends on them -- Install "libxcb1-dev" and "xorg-dev", as GLFW3 depends on them
-- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
For Linux aarch64(L4T): For Linux aarch64(L4T):
-- Install the GLFW3 library using "sudo apt-get install libglfw3-dev"; this will provide glfw3 -- Install the GLFW3 library using "sudo apt-get install libglfw3-dev"; this will provide glfw3
-- The install above will also provide libvulkan-dev as a dependency -- The install above will also provide libvulkan-dev as a dependency
-- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH -- Add Vulkan and GLFW3 libraries directories to LD_LIBRARY_PATH
-- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr" -- Pass path to vulkan sdk while building 'make VULKAN_SDK_PATH=<PATH_TO_VULKAN_SDK>', VULKAN_SDK_PATH in this scenario is typically "/usr"
For Shader changes:
-- Update the shader.vert and/or shader.frag shader source file as required
-- Use the glslc shader compiler from the installed Vulkan SDK's bin directory to compile shaders as:
glslc shader.vert -o vert.spv
glslc shader.frag -o frag.spv
** Make sure to add glslc's path to your PATH environment variable **

Binary file not shown.

Binary file not shown.

View File

@ -69,7 +69,7 @@ const std::vector<const char*> validationLayers = {
"VK_LAYER_KHRONOS_validation"}; "VK_LAYER_KHRONOS_validation"};
#ifdef NDEBUG #ifdef NDEBUG
const bool enableValidationLayers = false; const bool enableValidationLayers = true;
#else #else
const bool enableValidationLayers = false; const bool enableValidationLayers = false;
#endif #endif
@ -494,7 +494,7 @@ class vulkanImageCUDA {
unsigned int* image_data = NULL; unsigned int* image_data = NULL;
unsigned int imageWidth, imageHeight; unsigned int imageWidth, imageHeight;
unsigned int mipLevels; unsigned int mipLevels = 1;
size_t totalImageMemSize; size_t totalImageMemSize;
// CUDA objects // CUDA objects
@ -630,6 +630,9 @@ class vulkanImageCUDA {
vkDestroyBuffer(device, vertexBuffer, nullptr); vkDestroyBuffer(device, vertexBuffer, nullptr);
vkFreeMemory(device, vertexBufferMemory, nullptr); vkFreeMemory(device, vertexBufferMemory, nullptr);
vkDestroySemaphore(device, cudaUpdateVkSemaphore, nullptr);
vkDestroySemaphore(device, vkUpdateCudaSemaphore, nullptr);
for (size_t i = 0; i < MAX_FRAMES; i++) { for (size_t i = 0; i < MAX_FRAMES; i++) {
vkDestroySemaphore(device, renderFinishedSemaphores[i], nullptr); vkDestroySemaphore(device, renderFinishedSemaphores[i], nullptr);
vkDestroySemaphore(device, imageAvailableSemaphores[i], nullptr); vkDestroySemaphore(device, imageAvailableSemaphores[i], nullptr);
@ -686,7 +689,7 @@ class vulkanImageCUDA {
appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0); appInfo.applicationVersion = VK_MAKE_VERSION(1, 0, 0);
appInfo.pEngineName = "No Engine"; appInfo.pEngineName = "No Engine";
appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0); appInfo.engineVersion = VK_MAKE_VERSION(1, 0, 0);
appInfo.apiVersion = VK_API_VERSION_1_0; appInfo.apiVersion = VK_API_VERSION_1_1;
VkInstanceCreateInfo createInfo = {}; VkInstanceCreateInfo createInfo = {};
createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; createInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
@ -905,6 +908,7 @@ class vulkanImageCUDA {
} }
VkPhysicalDeviceFeatures deviceFeatures = {}; VkPhysicalDeviceFeatures deviceFeatures = {};
deviceFeatures.samplerAnisotropy = VK_TRUE;
VkDeviceCreateInfo createInfo = {}; VkDeviceCreateInfo createInfo = {};
createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; createInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
@ -1078,8 +1082,8 @@ class vulkanImageCUDA {
} }
void createGraphicsPipeline() { void createGraphicsPipeline() {
auto vertShaderCode = readFile("shader.vert"); auto vertShaderCode = readFile("vert.spv");
auto fragShaderCode = readFile("shader.frag"); auto fragShaderCode = readFile("frag.spv");
VkShaderModule vertShaderModule = createShaderModule(vertShaderCode); VkShaderModule vertShaderModule = createShaderModule(vertShaderCode);
VkShaderModule fragShaderModule = createShaderModule(fragShaderCode); VkShaderModule fragShaderModule = createShaderModule(fragShaderCode);
@ -1268,7 +1272,7 @@ class vulkanImageCUDA {
// VK_FORMAT_R8G8B8A8_UNORM changed to VK_FORMAT_R8G8B8A8_UINT // VK_FORMAT_R8G8B8A8_UNORM changed to VK_FORMAT_R8G8B8A8_UINT
createImage( createImage(
imageWidth, imageHeight, VK_FORMAT_R8G8B8A8_UINT, imageWidth, imageHeight, VK_FORMAT_R8G8B8A8_UNORM,
VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_TILING_OPTIMAL,
VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT |
VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT,
@ -1280,9 +1284,6 @@ class vulkanImageCUDA {
copyBufferToImage(stagingBuffer, textureImage, copyBufferToImage(stagingBuffer, textureImage,
static_cast<uint32_t>(imageWidth), static_cast<uint32_t>(imageWidth),
static_cast<uint32_t>(imageHeight)); static_cast<uint32_t>(imageHeight));
transitionImageLayout(textureImage, VK_FORMAT_R8G8B8A8_UINT,
VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
vkDestroyBuffer(device, stagingBuffer, nullptr); vkDestroyBuffer(device, stagingBuffer, nullptr);
vkFreeMemory(device, stagingBufferMemory, nullptr); vkFreeMemory(device, stagingBufferMemory, nullptr);
@ -1523,8 +1524,13 @@ class vulkanImageCUDA {
vkExternalMemImageCreateInfo.sType = vkExternalMemImageCreateInfo.sType =
VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO; VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO;
vkExternalMemImageCreateInfo.pNext = NULL; vkExternalMemImageCreateInfo.pNext = NULL;
#ifdef _WIN64
vkExternalMemImageCreateInfo.handleTypes =
VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
#else
vkExternalMemImageCreateInfo.handleTypes = vkExternalMemImageCreateInfo.handleTypes =
VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR; VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR;
#endif
imageInfo.pNext = &vkExternalMemImageCreateInfo; imageInfo.pNext = &vkExternalMemImageCreateInfo;
@ -2201,7 +2207,6 @@ class vulkanImageCUDA {
throw std::runtime_error( throw std::runtime_error(
"failed to create synchronization objects for a CUDA-Vulkan!"); "failed to create synchronization objects for a CUDA-Vulkan!");
} }
} }
void updateUniformBuffer() { void updateUniformBuffer() {
@ -2333,8 +2338,8 @@ class vulkanImageCUDA {
submitInfo.signalSemaphoreCount = 2; submitInfo.signalSemaphoreCount = 2;
submitInfo.pSignalSemaphores = signalSemaphores; submitInfo.pSignalSemaphores = signalSemaphores;
if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, inFlightFences[currentFrame]) != if (vkQueueSubmit(graphicsQueue, 1, &submitInfo,
VK_SUCCESS) { inFlightFences[currentFrame]) != VK_SUCCESS) {
throw std::runtime_error("failed to submit draw command buffer!"); throw std::runtime_error("failed to submit draw command buffer!");
} }
} }
@ -2360,8 +2365,8 @@ class vulkanImageCUDA {
submitInfo.signalSemaphoreCount = 2; submitInfo.signalSemaphoreCount = 2;
submitInfo.pSignalSemaphores = signalSemaphores; submitInfo.pSignalSemaphores = signalSemaphores;
if (vkQueueSubmit(graphicsQueue, 1, &submitInfo, inFlightFences[currentFrame]) != if (vkQueueSubmit(graphicsQueue, 1, &submitInfo,
VK_SUCCESS) { inFlightFences[currentFrame]) != VK_SUCCESS) {
throw std::runtime_error("failed to submit draw command buffer!"); throw std::runtime_error("failed to submit draw command buffer!");
} }
} }