Merge branch 'master_bug_fix' into 'master'

Bug fix for CUDA 13.2 samples

See merge request cuda-samples/cuda-samples!146
This commit is contained in:
Rob Armstrong 2026-01-21 09:07:07 -08:00
commit ce9407c240
16 changed files with 107 additions and 68 deletions

View File

@ -1,5 +1,8 @@
## Changelog ## Changelog
### CUDA 13.2
* Added the MSVC compile flag `-Xcompiler=/Zc:preprocessor` in CMakeLists.txt to comply with CUDA 13.2 CCCL. Previously, using the traditional preprocessor triggered the warning “MSVC/cl.exe with traditional preprocessor is used…”, which now leads to a build error.
### CUDA 13.1 ### CUDA 13.1
* Minor bug fixes and enhancements, no structural or functional changes * Minor bug fixes and enhancements, no structural or functional changes

View File

@ -22,6 +22,11 @@ endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
# Add MSVC-specific flags for standard-conforming preprocessor (required for CCCL)
if(MSVC)
add_compile_options($<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/Zc:preprocessor>)
endif()
# Include installation configuration before processing samples # Include installation configuration before processing samples
include(cmake/InstallSamples.cmake) include(cmake/InstallSamples.cmake)

View File

@ -1,6 +1,6 @@
# CUDA Samples # CUDA Samples
Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 13.1](https://developer.nvidia.com/cuda-downloads). Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 13.2](https://developer.nvidia.com/cuda-downloads).
## Release Notes ## Release Notes

View File

@ -25,6 +25,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ */
#include <cstdint>
#include <helper_cuda.h> // helper functions for CUDA error check #include <helper_cuda.h> // helper functions for CUDA error check
#include <iostream> #include <iostream>
@ -38,7 +39,7 @@ const int manualBlockSize = 32;
// execution configuration, including anything the launch configurator // execution configuration, including anything the launch configurator
// API suggests. // API suggests.
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void square(int *array, int arrayCount) __global__ void square(uint32_t *array, int arrayCount)
{ {
extern __shared__ int dynamicSmem[]; extern __shared__ int dynamicSmem[];
int idx = threadIdx.x + blockIdx.x * blockDim.x; int idx = threadIdx.x + blockIdx.x * blockDim.x;
@ -99,7 +100,7 @@ static double reportPotentialOccupancy(void *kernel, int blockSize, size_t dynam
// This function configures the launch based on the "automatic" // This function configures the launch based on the "automatic"
// argument, records the runtime, and reports occupancy and runtime. // argument, records the runtime, and reports occupancy and runtime.
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static int launchConfig(int *array, int arrayCount, bool automatic) static int launchConfig(uint32_t *array, int arrayCount, bool automatic)
{ {
int blockSize; int blockSize;
int minGridSize; int minGridSize;
@ -166,20 +167,20 @@ static int launchConfig(int *array, int arrayCount, bool automatic)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static int test(bool automaticLaunchConfig, const int count = 1000000) static int test(bool automaticLaunchConfig, const int count = 1000000)
{ {
int *array; uint32_t *array;
int *dArray; uint32_t *dArray;
int size = count * sizeof(int); int size = count * sizeof(uint32_t);
array = new int[count]; array = new uint32_t[count];
for (int i = 0; i < count; i += 1) { for (uint32_t i = 0; i < count; i += 1) {
array[i] = i; array[i] = i;
} }
checkCudaErrors(cudaMalloc(&dArray, size)); checkCudaErrors(cudaMalloc(&dArray, size));
checkCudaErrors(cudaMemcpy(dArray, array, size, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(dArray, array, size, cudaMemcpyHostToDevice));
for (int i = 0; i < count; i += 1) { for (uint32_t i = 0; i < count; i += 1) {
array[i] = 0; array[i] = 0;
} }
@ -189,8 +190,9 @@ static int test(bool automaticLaunchConfig, const int count = 1000000)
checkCudaErrors(cudaFree(dArray)); checkCudaErrors(cudaFree(dArray));
// Verify the return data // Verify the return data
// Both GPU and CPU use uint32_t * uint32_t, which has well-defined overflow behavior (modulo 2^32)
// //
for (int i = 0; i < count; i += 1) { for (uint32_t i = 0; i < count; i += 1) {
if (array[i] != i * i) { if (array[i] != i * i) {
std::cout << "element " << i << " expected " << i * i << " actual " << array[i] << std::endl; std::cout << "element " << i << " expected " << i * i << " actual " << array[i] << std::endl;
return 1; return 1;

View File

@ -39,10 +39,7 @@ if(${OpenGL_FOUND})
# Add target for particles # Add target for particles
add_executable(particles particleSystem.cpp particleSystem_cuda.cu particles.cpp render_particles.cpp shaders.cpp) add_executable(particles particleSystem.cpp particleSystem_cuda.cu particles.cpp render_particles.cpp shaders.cpp)
target_compile_options(particles PRIVATE target_compile_options(particles PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
$<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>
$<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<BOOL:${MSVC}>>:-Xcompiler="/Zc:preprocessor">
)
target_compile_features(particles PRIVATE cxx_std_17 cuda_std_17) target_compile_features(particles PRIVATE cxx_std_17 cuda_std_17)

View File

@ -135,8 +135,8 @@ extern "C"
thrust::device_ptr<float4> d_pos4((float4 *)pos); thrust::device_ptr<float4> d_pos4((float4 *)pos);
thrust::device_ptr<float4> d_vel4((float4 *)vel); thrust::device_ptr<float4> d_vel4((float4 *)vel);
thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(d_pos4, d_vel4)), thrust::for_each(thrust::make_zip_iterator(d_pos4, d_vel4),
thrust::make_zip_iterator(thrust::make_tuple(d_pos4 + numParticles, d_vel4 + numParticles)), thrust::make_zip_iterator(d_pos4 + numParticles, d_vel4 + numParticles),
integrate_functor(deltaTime)); integrate_functor(deltaTime));
} }

View File

@ -41,6 +41,9 @@
#include "thrust/iterator/zip_iterator.h" #include "thrust/iterator/zip_iterator.h"
#include "thrust/sort.h" #include "thrust/sort.h"
// for cuda::std::get
#include <cuda/std/utility>
namespace cg = cooperative_groups; namespace cg = cooperative_groups;
#include "helper_math.h" #include "helper_math.h"
#include "math_constants.h" #include "math_constants.h"
@ -60,8 +63,8 @@ struct integrate_functor
template <typename Tuple> __device__ void operator()(Tuple t) template <typename Tuple> __device__ void operator()(Tuple t)
{ {
volatile float4 posData = thrust::get<0>(t); volatile float4 posData = cuda::std::get<0>(t);
volatile float4 velData = thrust::get<1>(t); volatile float4 velData = cuda::std::get<1>(t);
float3 pos = make_float3(posData.x, posData.y, posData.z); float3 pos = make_float3(posData.x, posData.y, posData.z);
float3 vel = make_float3(velData.x, velData.y, velData.z); float3 vel = make_float3(velData.x, velData.y, velData.z);
@ -107,8 +110,8 @@ struct integrate_functor
} }
// store new position and velocity // store new position and velocity
thrust::get<0>(t) = make_float4(pos, posData.w); cuda::std::get<0>(t) = make_float4(pos, posData.w);
thrust::get<1>(t) = make_float4(vel, velData.w); cuda::std::get<1>(t) = make_float4(vel, velData.w);
} }
}; };

View File

@ -505,13 +505,12 @@ private:
thrust::device_ptr<uint> dMinScannedEdges = pools.uintEdges.get(); thrust::device_ptr<uint> dMinScannedEdges = pools.uintEdges.get();
thrust::device_ptr<float> dMinScannedWeights = pools.floatEdges.get(); thrust::device_ptr<float> dMinScannedWeights = pools.floatEdges.get();
thrust::inclusive_scan_by_key( thrust::inclusive_scan_by_key(dEdgesFlags,
dEdgesFlags, dEdgesFlags + edgesCount_,
dEdgesFlags + edgesCount_, thrust::make_zip_iterator(dWeights_, dEdges_),
thrust::make_zip_iterator(thrust::make_tuple(dWeights_, dEdges_)), thrust::make_zip_iterator(dMinScannedWeights, dMinScannedEdges),
thrust::make_zip_iterator(thrust::make_tuple(dMinScannedWeights, dMinScannedEdges)), thrust::greater_equal<uint>(),
thrust::greater_equal<uint>(), thrust::minimum<cuda::std::tuple<float, uint>>());
thrust::minimum<thrust::tuple<float, uint>>());
// To make things clear. // To make things clear.
// Let "dEdgesFlags" denote groups of edges that // Let "dEdgesFlags" denote groups of edges that
@ -562,11 +561,10 @@ private:
thrust::sequence(dClusteredVerticesIDs, dClusteredVerticesIDs + verticesCount_); thrust::sequence(dClusteredVerticesIDs, dClusteredVerticesIDs + verticesCount_);
thrust::sort(thrust::make_zip_iterator(thrust::make_tuple(thrust::device_ptr<uint>(dSuccessors), thrust::sort(thrust::make_zip_iterator(thrust::device_ptr<uint>(dSuccessors),
thrust::device_ptr<uint>(dClusteredVerticesIDs))), thrust::device_ptr<uint>(dClusteredVerticesIDs)),
thrust::make_zip_iterator( thrust::make_zip_iterator(thrust::device_ptr<uint>(dSuccessors + verticesCount_),
thrust::make_tuple(thrust::device_ptr<uint>(dSuccessors + verticesCount_), thrust::device_ptr<uint>(dClusteredVerticesIDs + verticesCount_)));
thrust::device_ptr<uint>(dClusteredVerticesIDs + verticesCount_))));
// Mark those groups. // Mark those groups.
thrust::device_ptr<uint> dVerticesFlags_ = pools.uintVertices.get(); thrust::device_ptr<uint> dVerticesFlags_ = pools.uintVertices.get();
@ -653,9 +651,8 @@ private:
// the step of the algorithm to that one. Hence we need to // the step of the algorithm to that one. Hence we need to
// preserve the structure of the original graph: neighbours and // preserve the structure of the original graph: neighbours and
// weights should be grouped by vertex. // weights should be grouped by vertex.
thrust::sort(thrust::make_zip_iterator(thrust::make_tuple(dNewStartpoints, dSurvivedEdgesIDs)), thrust::sort(thrust::make_zip_iterator(dNewStartpoints, dSurvivedEdgesIDs),
thrust::make_zip_iterator( thrust::make_zip_iterator(dNewStartpoints + edgesCount_, dSurvivedEdgesIDs + edgesCount_));
thrust::make_tuple(dNewStartpoints + edgesCount_, dSurvivedEdgesIDs + edgesCount_)));
// Find the group of contracted edges. // Find the group of contracted edges.
uint *invalidEdgesPtr = uint *invalidEdgesPtr =

View File

@ -30,6 +30,9 @@
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include <thrust/random.h> #include <thrust/random.h>
// for cuda::std::tuple
#include <cuda/std/tuple>
namespace cg = cooperative_groups; namespace cg = cooperative_groups;
#include <helper_cuda.h> #include <helper_cuda.h>
@ -610,7 +613,7 @@ struct Random_generator
return a; return a;
} }
__host__ __device__ __forceinline__ thrust::tuple<float, float> operator()() __host__ __device__ __forceinline__ cuda::std::tuple<float, float> operator()()
{ {
#ifdef __CUDA_ARCH__ #ifdef __CUDA_ARCH__
unsigned seed = hash(blockIdx.x * blockDim.x + threadIdx.x + count); unsigned seed = hash(blockIdx.x * blockDim.x + threadIdx.x + count);
@ -622,7 +625,7 @@ struct Random_generator
#endif #endif
thrust::default_random_engine rng(seed); thrust::default_random_engine rng(seed);
thrust::random::uniform_real_distribution<float> distrib; thrust::random::uniform_real_distribution<float> distrib;
return thrust::make_tuple(distrib(rng), distrib(rng)); return cuda::std::make_tuple(distrib(rng), distrib(rng));
} }
}; };
@ -644,9 +647,8 @@ bool cdpQuadtree(int warp_size)
// Generate random points. // Generate random points.
Random_generator rnd; Random_generator rnd;
thrust::generate(thrust::make_zip_iterator(thrust::make_tuple(x_d0.begin(), y_d0.begin())), thrust::generate(
thrust::make_zip_iterator(thrust::make_tuple(x_d0.end(), y_d0.end())), thrust::make_zip_iterator(x_d0.begin(), y_d0.begin()), thrust::make_zip_iterator(x_d0.end(), y_d0.end()), rnd);
rnd);
// Host structures to analyze the device ones. // Host structures to analyze the device ones.
Points points_init[2]; Points points_init[2];

View File

@ -144,8 +144,9 @@ VulkanBaseApp::~VulkanBaseApp()
} }
#ifdef _VK_TIMELINE_SEMAPHORE #ifdef _VK_TIMELINE_SEMAPHORE
if (m_vkPresentationSemaphore != VK_NULL_HANDLE) { for (size_t i = 0; i < m_vkImageAcquiredSemaphores.size(); i++) {
vkDestroySemaphore(m_device, m_vkPresentationSemaphore, nullptr); vkDestroySemaphore(m_device, m_vkImageAcquiredSemaphores[i], nullptr);
vkDestroySemaphore(m_device, m_vkRenderCompleteSemaphores[i], nullptr);
} }
#endif /* _VK_TIMELINE_SEMAPHORE */ #endif /* _VK_TIMELINE_SEMAPHORE */
@ -1351,8 +1352,15 @@ void VulkanBaseApp::createSyncObjects()
} }
#ifdef _VK_TIMELINE_SEMAPHORE #ifdef _VK_TIMELINE_SEMAPHORE
if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_vkPresentationSemaphore) != VK_SUCCESS) { m_vkImageAcquiredSemaphores.resize(MAX_FRAMES_IN_FLIGHT);
throw std::runtime_error("Failed to create binary semaphore!"); m_vkRenderCompleteSemaphores.resize(MAX_FRAMES_IN_FLIGHT);
for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) {
if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_vkImageAcquiredSemaphores[i]) != VK_SUCCESS) {
throw std::runtime_error("Failed to create image acquired semaphore!");
}
if (vkCreateSemaphore(m_device, &semaphoreInfo, nullptr, &m_vkRenderCompleteSemaphores[i]) != VK_SUCCESS) {
throw std::runtime_error("Failed to create render complete semaphore!");
}
} }
#endif /* _VK_TIMELINE_SEMAPHORE */ #endif /* _VK_TIMELINE_SEMAPHORE */
} }
@ -1634,6 +1642,12 @@ void VulkanBaseApp::drawFrame()
static uint64_t waitValue = 0; static uint64_t waitValue = 0;
static uint64_t signalValue = 1; static uint64_t signalValue = 1;
size_t currentFrameIdx = m_currentFrame % MAX_FRAMES_IN_FLIGHT;
// Wait for this frame's fence to avoid VUID-vkAcquireNextImageKHR-semaphore-01779
// Ensures previous frame's GPU work completes before reusing per-frame semaphores
vkWaitForFences(m_device, 1, &m_inFlightFences[currentFrameIdx], VK_TRUE, std::numeric_limits<uint64_t>::max());
VkSemaphoreWaitInfo semaphoreWaitInfo = {}; VkSemaphoreWaitInfo semaphoreWaitInfo = {};
semaphoreWaitInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO; semaphoreWaitInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO;
semaphoreWaitInfo.pSemaphores = &m_vkTimelineSemaphore; semaphoreWaitInfo.pSemaphores = &m_vkTimelineSemaphore;
@ -1641,20 +1655,26 @@ void VulkanBaseApp::drawFrame()
semaphoreWaitInfo.pValues = &waitValue; semaphoreWaitInfo.pValues = &waitValue;
vkWaitSemaphores(m_device, &semaphoreWaitInfo, std::numeric_limits<uint64_t>::max()); vkWaitSemaphores(m_device, &semaphoreWaitInfo, std::numeric_limits<uint64_t>::max());
// Use separate binary semaphores for swapchain sync to avoid VUID-01779
// Timeline semaphore (m_vkTimelineSemaphore) is dedicated to Vulkan-CUDA interop
uint32_t imageIndex; uint32_t imageIndex;
VkResult result = vkAcquireNextImageKHR(m_device, VkResult result = vkAcquireNextImageKHR(m_device,
m_swapChain, m_swapChain,
std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::max(),
m_vkPresentationSemaphore, m_vkImageAcquiredSemaphores[currentFrameIdx],
VK_NULL_HANDLE, VK_NULL_HANDLE,
&imageIndex); &imageIndex);
if (result == VK_ERROR_OUT_OF_DATE_KHR) { if (result == VK_ERROR_OUT_OF_DATE_KHR) {
recreateSwapChain(); recreateSwapChain();
return;
} }
else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) { else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) {
throw std::runtime_error("Failed to acquire swap chain image!"); throw std::runtime_error("Failed to acquire swap chain image!");
} }
// Reset the fence for this frame
vkResetFences(m_device, 1, &m_inFlightFences[currentFrameIdx]);
updateUniformBuffer(imageIndex); updateUniformBuffer(imageIndex);
VkSubmitInfo submitInfo = {}; VkSubmitInfo submitInfo = {};
@ -1663,6 +1683,8 @@ void VulkanBaseApp::drawFrame()
std::vector<VkSemaphore> waitSemaphores; std::vector<VkSemaphore> waitSemaphores;
std::vector<VkPipelineStageFlags> waitStages; std::vector<VkPipelineStageFlags> waitStages;
waitSemaphores.push_back(m_vkImageAcquiredSemaphores[currentFrameIdx]);
waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT);
waitSemaphores.push_back(m_vkTimelineSemaphore); waitSemaphores.push_back(m_vkTimelineSemaphore);
waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT); waitStages.push_back(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT);
@ -1675,26 +1697,30 @@ void VulkanBaseApp::drawFrame()
std::vector<VkSemaphore> signalSemaphores; std::vector<VkSemaphore> signalSemaphores;
signalSemaphores.push_back(m_vkTimelineSemaphore); signalSemaphores.push_back(m_vkTimelineSemaphore);
signalSemaphores.push_back(m_vkRenderCompleteSemaphores[currentFrameIdx]);
submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size(); submitInfo.signalSemaphoreCount = (uint32_t)signalSemaphores.size();
submitInfo.pSignalSemaphores = signalSemaphores.data(); submitInfo.pSignalSemaphores = signalSemaphores.data();
VkTimelineSemaphoreSubmitInfo timelineInfo = {}; VkTimelineSemaphoreSubmitInfo timelineInfo = {};
timelineInfo.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO; timelineInfo.sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO;
timelineInfo.waitSemaphoreValueCount = 1; timelineInfo.waitSemaphoreValueCount = 2;
timelineInfo.pWaitSemaphoreValues = &waitValue; uint64_t waitValues[] = {0, waitValue};
timelineInfo.signalSemaphoreValueCount = 1; timelineInfo.pWaitSemaphoreValues = waitValues;
timelineInfo.pSignalSemaphoreValues = &signalValue; timelineInfo.signalSemaphoreValueCount = 2;
uint64_t signalValues[] = {signalValue, 0};
timelineInfo.pSignalSemaphoreValues = signalValues;
submitInfo.pNext = &timelineInfo; submitInfo.pNext = &timelineInfo;
if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, VK_NULL_HANDLE) != VK_SUCCESS) { // Submit with fence to track when this frame completes
if (vkQueueSubmit(m_graphicsQueue, 1, &submitInfo, m_inFlightFences[currentFrameIdx]) != VK_SUCCESS) {
throw std::runtime_error("failed to submit draw command buffer!"); throw std::runtime_error("failed to submit draw command buffer!");
} }
VkPresentInfoKHR presentInfo = {}; VkPresentInfoKHR presentInfo = {};
presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
presentInfo.waitSemaphoreCount = 1; presentInfo.waitSemaphoreCount = 1;
presentInfo.pWaitSemaphores = &m_vkPresentationSemaphore; presentInfo.pWaitSemaphores = &m_vkRenderCompleteSemaphores[currentFrameIdx];
VkSwapchainKHR swapChains[] = {m_swapChain}; VkSwapchainKHR swapChains[] = {m_swapChain};
presentInfo.swapchainCount = 1; presentInfo.swapchainCount = 1;

View File

@ -107,7 +107,8 @@ protected:
std::vector<VkFence> m_inFlightFences; std::vector<VkFence> m_inFlightFences;
std::vector<VkBuffer> m_uniformBuffers; std::vector<VkBuffer> m_uniformBuffers;
std::vector<VkDeviceMemory> m_uniformMemory; std::vector<VkDeviceMemory> m_uniformMemory;
VkSemaphore m_vkPresentationSemaphore; std::vector<VkSemaphore> m_vkImageAcquiredSemaphores;
std::vector<VkSemaphore> m_vkRenderCompleteSemaphores;
VkSemaphore m_vkTimelineSemaphore; VkSemaphore m_vkTimelineSemaphore;
VkDescriptorSetLayout m_descriptorSetLayout; VkDescriptorSetLayout m_descriptorSetLayout;
VkDescriptorPool m_descriptorPool; VkDescriptorPool m_descriptorPool;

View File

@ -38,7 +38,7 @@
typedef float vec2[2]; typedef float vec2[2];
std::string execution_path; std::string execution_path;
#ifndef NDEBUG #ifdef NDEBUG
#define ENABLE_VALIDATION (false) #define ENABLE_VALIDATION (false)
#else #else
#define ENABLE_VALIDATION (true) #define ENABLE_VALIDATION (true)

View File

@ -44,7 +44,7 @@
#include "helper_multiprocess.h" #include "helper_multiprocess.h"
// #define DEBUG // #define DEBUG
#ifndef DEBUG #ifdef NDEBUG
#define ENABLE_VALIDATION (false) #define ENABLE_VALIDATION (false)
#else #else
#define ENABLE_VALIDATION (true) #define ENABLE_VALIDATION (true)

View File

@ -127,9 +127,9 @@ extern "C"
thrust::device_ptr<float4> d_oldVel(oldVel); thrust::device_ptr<float4> d_oldVel(oldVel);
thrust::for_each( thrust::for_each(
thrust::make_zip_iterator(thrust::make_tuple(d_newPos, d_newVel, d_oldPos, d_oldVel)), thrust::make_zip_iterator(d_newPos, d_newVel, d_oldPos, d_oldVel),
thrust::make_zip_iterator(thrust::make_tuple( thrust::make_zip_iterator(
d_newPos + numParticles, d_newVel + numParticles, d_oldPos + numParticles, d_oldVel + numParticles)), d_newPos + numParticles, d_newVel + numParticles, d_oldPos + numParticles, d_oldVel + numParticles),
integrate_functor(deltaTime, noiseTex)); integrate_functor(deltaTime, noiseTex));
} }
@ -143,8 +143,8 @@ extern "C"
thrust::device_ptr<float> d_keys(keys); thrust::device_ptr<float> d_keys(keys);
thrust::device_ptr<uint> d_indices(indices); thrust::device_ptr<uint> d_indices(indices);
thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(d_pos, d_keys)), thrust::for_each(thrust::make_zip_iterator(d_pos, d_keys),
thrust::make_zip_iterator(thrust::make_tuple(d_pos + numParticles, d_keys + numParticles)), thrust::make_zip_iterator(d_pos + numParticles, d_keys + numParticles),
calcDepth_functor(sortVector)); calcDepth_functor(sortVector));
thrust::sequence(d_indices, d_indices + numParticles); thrust::sequence(d_indices, d_indices + numParticles);

View File

@ -40,6 +40,9 @@
#include "thrust/iterator/zip_iterator.h" #include "thrust/iterator/zip_iterator.h"
#include "thrust/sort.h" #include "thrust/sort.h"
// for cuda::std::get
#include <cuda/std/utility>
cudaTextureObject_t noiseTex; cudaTextureObject_t noiseTex;
// simulation parameters // simulation parameters
__constant__ SimParams cudaParams; __constant__ SimParams cudaParams;
@ -65,8 +68,8 @@ struct integrate_functor
template <typename Tuple> __device__ void operator()(Tuple t) template <typename Tuple> __device__ void operator()(Tuple t)
{ {
volatile float4 posData = thrust::get<2>(t); volatile float4 posData = cuda::std::get<2>(t);
volatile float4 velData = thrust::get<3>(t); volatile float4 velData = cuda::std::get<3>(t);
float3 pos = make_float3(posData.x, posData.y, posData.z); float3 pos = make_float3(posData.x, posData.y, posData.z);
float3 vel = make_float3(velData.x, velData.y, velData.z); float3 vel = make_float3(velData.x, velData.y, velData.z);
@ -95,8 +98,8 @@ struct integrate_functor
vel *= cudaParams.globalDamping; vel *= cudaParams.globalDamping;
// store new position and velocity // store new position and velocity
thrust::get<0>(t) = make_float4(pos, age); cuda::std::get<0>(t) = make_float4(pos, age);
thrust::get<1>(t) = make_float4(vel, velData.w); cuda::std::get<1>(t) = make_float4(vel, velData.w);
} }
}; };
@ -111,10 +114,10 @@ struct calcDepth_functor
template <typename Tuple> __host__ __device__ void operator()(Tuple t) template <typename Tuple> __host__ __device__ void operator()(Tuple t)
{ {
volatile float4 p = thrust::get<0>(t); volatile float4 p = cuda::std::get<0>(t);
float key = -dot(make_float3(p.x, p.y, p.z), float key = -dot(make_float3(p.x, p.y, p.z),
sortVector); // project onto sort vector sortVector); // project onto sort vector
thrust::get<1>(t) = key; cuda::std::get<1>(t) = key;
} }
}; };

View File

@ -69,9 +69,9 @@ const int MAX_FRAMES = 4;
const std::vector<const char *> validationLayers = {"VK_LAYER_KHRONOS_validation"}; const std::vector<const char *> validationLayers = {"VK_LAYER_KHRONOS_validation"};
#ifdef NDEBUG #ifdef NDEBUG
const bool enableValidationLayers = true;
#else
const bool enableValidationLayers = false; const bool enableValidationLayers = false;
#else
const bool enableValidationLayers = true;
#endif #endif
std::string execution_path; std::string execution_path;