From 770e433a9ec260fe659036a43a5d2673b39ce45b Mon Sep 17 00:00:00 2001 From: Peggy Tian Date: Mon, 12 May 2025 06:04:22 +0000 Subject: [PATCH 1/7] Bug 5056055: limit register usage to 128 per thread in debug mode to comply with the maximum number of 32-bit registers per SM --- .../conjugateGradientMultiBlockCG/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Samples/4_CUDA_Libraries/conjugateGradientMultiBlockCG/CMakeLists.txt b/Samples/4_CUDA_Libraries/conjugateGradientMultiBlockCG/CMakeLists.txt index 589059d7..924e640f 100644 --- a/Samples/4_CUDA_Libraries/conjugateGradientMultiBlockCG/CMakeLists.txt +++ b/Samples/4_CUDA_Libraries/conjugateGradientMultiBlockCG/CMakeLists.txt @@ -13,6 +13,7 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") if(ENABLE_CUDA_DEBUG) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -maxrregcount=128") # limit register usage to 128 per thread to comply with the maximum number of 32-bit registers per SM else() set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option) endif() From 2ec9cf394a2e3143a6f424e75295922b2db59787 Mon Sep 17 00:00:00 2001 From: shawnz Date: Mon, 12 May 2025 15:00:52 +0800 Subject: [PATCH 2/7] Bug 5272236: Update the include file copy path as path changes on 13.0 --- Samples/0_Introduction/matrixMul_nvrtc/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Samples/0_Introduction/matrixMul_nvrtc/CMakeLists.txt b/Samples/0_Introduction/matrixMul_nvrtc/CMakeLists.txt index 2d038774..1e7f6618 100644 --- a/Samples/0_Introduction/matrixMul_nvrtc/CMakeLists.txt +++ b/Samples/0_Introduction/matrixMul_nvrtc/CMakeLists.txt @@ -45,10 +45,10 @@ add_custom_command(TARGET matrixMul_nvrtc POST_BUILD add_custom_command(TARGET matrixMul_nvrtc POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory - ${CUDAToolkit_BIN_DIR}/../include/nv ${CMAKE_CURRENT_BINARY_DIR}/nv + ${CUDAToolkit_BIN_DIR}/../include/cccl/nv ${CMAKE_CURRENT_BINARY_DIR}/nv ) add_custom_command(TARGET matrixMul_nvrtc POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory - ${CUDAToolkit_BIN_DIR}/../include/cuda ${CMAKE_CURRENT_BINARY_DIR}/cuda + ${CUDAToolkit_BIN_DIR}/../include/cccl/cuda ${CMAKE_CURRENT_BINARY_DIR}/cuda ) From 8f33cc60947b8f3e8ac72d2f3c25c46a96cea583 Mon Sep 17 00:00:00 2001 From: shawnz Date: Mon, 12 May 2025 15:02:31 +0800 Subject: [PATCH 3/7] Bug 5274280: Enable 8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop --- Samples/8_Platform_Specific/Tegra/CMakeLists.txt | 2 +- .../Tegra/EGLSync_CUDAEvent_Interop/CMakeLists.txt | 2 +- .../EGLSync_CUDAEvent_Interop/EGLSync_CUDAEvent_Interop.cu | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Samples/8_Platform_Specific/Tegra/CMakeLists.txt b/Samples/8_Platform_Specific/Tegra/CMakeLists.txt index c1a65040..356c2a13 100644 --- a/Samples/8_Platform_Specific/Tegra/CMakeLists.txt +++ b/Samples/8_Platform_Specific/Tegra/CMakeLists.txt @@ -5,7 +5,7 @@ add_subdirectory(cuDLAHybridMode) add_subdirectory(cuDLALayerwiseStatsHybrid) add_subdirectory(cuDLALayerwiseStatsStandalone) add_subdirectory(cuDLAStandaloneMode) -#add_subdirectory(EGLSync_CUDAEvent_Interop) +add_subdirectory(EGLSync_CUDAEvent_Interop) add_subdirectory(fluidsGLES) add_subdirectory(nbody_opengles) add_subdirectory(simpleGLES) diff --git a/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/CMakeLists.txt b/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/CMakeLists.txt index 90ad2f3b..b75b8356 100644 --- a/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/CMakeLists.txt +++ b/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/CMakeLists.txt @@ -17,7 +17,7 @@ else() endif() # Include directories and libraries -include_directories(../../../Common) +include_directories(../../../../Common) find_package(EGL) find_package(X11) diff --git a/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/EGLSync_CUDAEvent_Interop.cu b/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/EGLSync_CUDAEvent_Interop.cu index 1c459c5c..9e709a16 100644 --- a/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/EGLSync_CUDAEvent_Interop.cu +++ b/Samples/8_Platform_Specific/Tegra/EGLSync_CUDAEvent_Interop/EGLSync_CUDAEvent_Interop.cu @@ -259,7 +259,8 @@ void checkSync(int argc, char **argv) unsigned char *pSurf_read = NULL, *pSurf_write = NULL; int integrated; - CUresult status = CUDA_SUCCESS; + CUresult status = CUDA_SUCCESS; + CUctxCreateParams ctxCreateParams = {}; // Init values for variables x = y = 0; @@ -269,7 +270,7 @@ void checkSync(int argc, char **argv) } device = findCudaDeviceDRV(argc, (const char **)argv); - if (CUDA_SUCCESS != (status = cuCtxCreate(&context, 0, device))) { + if (CUDA_SUCCESS != (status = cuCtxCreate(&context, &ctxCreateParams, 0, device))) { printf("failed to create CUDA context\n"); } cuCtxPushCurrent(context); From c6208f58971818fbdb8f2f933a8a5e3973a3679a Mon Sep 17 00:00:00 2001 From: shawnz Date: Mon, 12 May 2025 15:39:08 +0800 Subject: [PATCH 4/7] Bug 5263330: Update CUFFT errors as per latest changes on CUDA 13.0 --- Common/helper_cuda.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Common/helper_cuda.h b/Common/helper_cuda.h index dc0efc46..401c41b2 100644 --- a/Common/helper_cuda.h +++ b/Common/helper_cuda.h @@ -138,26 +138,26 @@ static const char *_cudaGetErrorEnum(cufftResult error) { case CUFFT_UNALIGNED_DATA: return "CUFFT_UNALIGNED_DATA"; - case CUFFT_INCOMPLETE_PARAMETER_LIST: - return "CUFFT_INCOMPLETE_PARAMETER_LIST"; - case CUFFT_INVALID_DEVICE: return "CUFFT_INVALID_DEVICE"; - case CUFFT_PARSE_ERROR: - return "CUFFT_PARSE_ERROR"; - case CUFFT_NO_WORKSPACE: return "CUFFT_NO_WORKSPACE"; case CUFFT_NOT_IMPLEMENTED: return "CUFFT_NOT_IMPLEMENTED"; - case CUFFT_LICENSE_ERROR: - return "CUFFT_LICENSE_ERROR"; + case CUFFT_MISSING_DEPENDENCY: + return "CUFFT_MISSING_DEPENDENCY"; - case CUFFT_NOT_SUPPORTED: - return "CUFFT_NOT_SUPPORTED"; + case CUFFT_NVRTC_FAILURE: + return "CUFFT_NVRTC_FAILURE"; + + case CUFFT_NVJITLINK_FAILURE: + return "CUFFT_NVJITLINK_FAILURE"; + + case CUFFT_NVSHMEM_FAILURE: + return "CUFFT_NVSHMEM_FAILURE"; } return ""; From 107f3f537fc73dae22608b3d4a9c486210ad3e9e Mon Sep 17 00:00:00 2001 From: shawnz Date: Mon, 19 May 2025 17:38:22 +0800 Subject: [PATCH 5/7] Update the include files sequence for vulkan samples on Windows --- .../simpleVulkan/VulkanBaseApp.h | 22 ++++++++++--------- .../simpleVulkanMMAP/VulkanBaseApp.h | 4 +++- .../vulkanImageCUDA/vulkanImageCUDA.cu | 4 +++- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.h b/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.h index efb5fac4..616b1c14 100644 --- a/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.h +++ b/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.h @@ -34,8 +34,10 @@ #include #ifdef _WIN64 #define NOMINMAX -#include +// Add windows.h to the include path firstly as dependency for other Windows headers #include +// Add other Windows headers +#include #endif /* _WIN64 */ /* remove _VK_TIMELINE_SEMAPHORE to use binary semaphores */ @@ -54,6 +56,7 @@ public: void init(); void *getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType); void *getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); + bool isVkPhysicalDeviceUuid(void *Uuid); void createExternalSemaphore(VkSemaphore &semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, @@ -85,6 +88,7 @@ protected: VkDebugUtilsMessengerEXT m_debugMessenger; VkSurfaceKHR m_surface; VkPhysicalDevice m_physicalDevice; + uint8_t m_deviceUUID[VK_UUID_SIZE]; VkDevice m_device; VkQueue m_graphicsQueue; VkQueue m_presentQueue; @@ -105,17 +109,15 @@ protected: std::vector m_inFlightFences; std::vector m_uniformBuffers; std::vector m_uniformMemory; - VkSemaphore m_vkPresentationSemaphore; - VkSemaphore m_vkTimelineSemaphore; VkDescriptorSetLayout m_descriptorSetLayout; VkDescriptorPool m_descriptorPool; std::vector m_descriptorSets; - VkImage m_depthImage; - VkDeviceMemory m_depthImageMemory; - VkImageView m_depthImageView; - size_t m_currentFrame; - bool m_framebufferResized; - uint8_t m_vkDeviceUUID[VK_UUID_SIZE]; + + VkImage m_depthImage; + VkDeviceMemory m_depthImageMemory; + VkImageView m_depthImageView; + size_t m_currentFrame; + bool m_framebufferResized; virtual void initVulkanApp() {} virtual void fillRenderingCommandBuffer(VkCommandBuffer &buffer) {} @@ -128,7 +130,7 @@ protected: std::vector &waitStages) const; virtual void getSignalFrameSemaphores(std::vector &signal) const; virtual VkDeviceSize getUniformSize() const; - virtual void updateUniformBuffer(uint32_t imageIndex); + virtual void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame); virtual void drawFrame(); private: diff --git a/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.h b/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.h index 814e321e..4f4425bd 100644 --- a/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.h +++ b/Samples/5_Domain_Specific/simpleVulkanMMAP/VulkanBaseApp.h @@ -34,8 +34,10 @@ #include #ifdef _WIN64 #define NOMINMAX -#include +// Add windows.h to the include path firstly as dependency for other Windows headers #include +// Add other Windows headers +#include #endif /* _WIN64 */ struct GLFWwindow; diff --git a/Samples/5_Domain_Specific/vulkanImageCUDA/vulkanImageCUDA.cu b/Samples/5_Domain_Specific/vulkanImageCUDA/vulkanImageCUDA.cu index f782bbb9..b94487fc 100644 --- a/Samples/5_Domain_Specific/vulkanImageCUDA/vulkanImageCUDA.cu +++ b/Samples/5_Domain_Specific/vulkanImageCUDA/vulkanImageCUDA.cu @@ -27,10 +27,12 @@ #define GLFW_INCLUDE_VULKAN #ifdef _WIN64 +// Add windows.h to the include path firstly as dependency for other Windows headers +#include +// Add other Windows headers #include #include #include -#include #define _USE_MATH_DEFINES #endif From 5987a9e9fa5197079ae38117fe7533a199e25f76 Mon Sep 17 00:00:00 2001 From: shawnz Date: Mon, 19 May 2025 17:38:42 +0800 Subject: [PATCH 6/7] Update transpose for code format check --- Samples/6_Performance/transpose/transpose.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Samples/6_Performance/transpose/transpose.cu b/Samples/6_Performance/transpose/transpose.cu index 8a1d8bec..09c7582e 100644 --- a/Samples/6_Performance/transpose/transpose.cu +++ b/Samples/6_Performance/transpose/transpose.cu @@ -597,17 +597,16 @@ int main(int argc, char **argv) 1, TILE_DIM * BLOCK_ROWS); - // Reset d_odata to zero before starting the next loop iteration to avoid - // carrying over results from previous kernels. Without this reset, residual - // data from a prior kernel (e.g., 'copy') could make a subsequent - // kernel (e.g., 'copySharedMem') appear correct even if it performs no work, + // Reset d_odata to zero before starting the next loop iteration to avoid + // carrying over results from previous kernels. Without this reset, residual + // data from a prior kernel (e.g., 'copy') could make a subsequent + // kernel (e.g., 'copySharedMem') appear correct even if it performs no work, // leading to false positives in compareData. for (int i = 0; i < (size_x * size_y); ++i) { h_odata[i] = 0; } // copy host data to device - checkCudaErrors( - cudaMemcpy(d_odata, h_odata, mem_size, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_odata, h_odata, mem_size, cudaMemcpyHostToDevice)); } // cleanup From da3b7a2b3cecd5f73a19bcc7fa502169b72c90bf Mon Sep 17 00:00:00 2001 From: shawnz Date: Mon, 19 May 2025 17:43:08 +0800 Subject: [PATCH 7/7] Update the vulkanImageCUDA/vulkanImageCUDA.cu for Windows headers --- .../simpleVulkan/VulkanBaseApp.h | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.h b/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.h index 616b1c14..7354881d 100644 --- a/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.h +++ b/Samples/5_Domain_Specific/simpleVulkan/VulkanBaseApp.h @@ -34,9 +34,9 @@ #include #ifdef _WIN64 #define NOMINMAX -// Add windows.h to the include path firstly as dependency for other Windows headers +// Add windows.h to the include path #include -// Add other Windows headers +// Add vulkan_win32.h to the include path #include #endif /* _WIN64 */ @@ -56,7 +56,6 @@ public: void init(); void *getMemHandle(VkDeviceMemory memory, VkExternalMemoryHandleTypeFlagBits handleType); void *getSemaphoreHandle(VkSemaphore semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); - bool isVkPhysicalDeviceUuid(void *Uuid); void createExternalSemaphore(VkSemaphore &semaphore, VkExternalSemaphoreHandleTypeFlagBits handleType); void createBuffer(VkDeviceSize size, VkBufferUsageFlags usage, @@ -88,7 +87,6 @@ protected: VkDebugUtilsMessengerEXT m_debugMessenger; VkSurfaceKHR m_surface; VkPhysicalDevice m_physicalDevice; - uint8_t m_deviceUUID[VK_UUID_SIZE]; VkDevice m_device; VkQueue m_graphicsQueue; VkQueue m_presentQueue; @@ -109,15 +107,17 @@ protected: std::vector m_inFlightFences; std::vector m_uniformBuffers; std::vector m_uniformMemory; + VkSemaphore m_vkPresentationSemaphore; + VkSemaphore m_vkTimelineSemaphore; VkDescriptorSetLayout m_descriptorSetLayout; VkDescriptorPool m_descriptorPool; std::vector m_descriptorSets; - - VkImage m_depthImage; - VkDeviceMemory m_depthImageMemory; - VkImageView m_depthImageView; - size_t m_currentFrame; - bool m_framebufferResized; + VkImage m_depthImage; + VkDeviceMemory m_depthImageMemory; + VkImageView m_depthImageView; + size_t m_currentFrame; + bool m_framebufferResized; + uint8_t m_vkDeviceUUID[VK_UUID_SIZE]; virtual void initVulkanApp() {} virtual void fillRenderingCommandBuffer(VkCommandBuffer &buffer) {} @@ -130,7 +130,7 @@ protected: std::vector &waitStages) const; virtual void getSignalFrameSemaphores(std::vector &signal) const; virtual VkDeviceSize getUniformSize() const; - virtual void updateUniformBuffer(uint32_t imageIndex, size_t globalFrame); + virtual void updateUniformBuffer(uint32_t imageIndex); virtual void drawFrame(); private: