diff --git a/CHANGELOG.md b/CHANGELOG.md index d99370f0..ad1e792e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ ## Changelog +### CUDA 12.3 +* Added cuDLA samples +* Fixed jitLto regression + ### CUDA 12.2 * libNVVM samples received updates * Fixed jitLto Case issues diff --git a/README.md b/README.md index 113a4c61..5f695c45 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,12 @@ # CUDA Samples -Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads). +Samples for CUDA Developers which demonstrate features in CUDA Toolkit. This version supports [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads). ## Release Notes This section describes the release notes for the CUDA Samples on GitHub only. -### CUDA 12.2 +### CUDA 12.3 ### [older versions...](./CHANGELOG.md) @@ -14,7 +14,7 @@ This section describes the release notes for the CUDA Samples on GitHub only. ### Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html). 
### Getting the CUDA Samples diff --git a/Samples/0_Introduction/UnifiedMemoryStreams/README.md b/Samples/0_Introduction/UnifiedMemoryStreams/README.md index 0587b046..699c248e 100644 --- a/Samples/0_Introduction/UnifiedMemoryStreams/README.md +++ b/Samples/0_Introduction/UnifiedMemoryStreams/README.md @@ -28,7 +28,7 @@ cudaStreamDestroy, cudaFree, cudaMallocManaged, cudaStreamAttachMemAsync, cudaSe ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/asyncAPI/README.md b/Samples/0_Introduction/asyncAPI/README.md index 7583b4ac..2a194687 100644 --- a/Samples/0_Introduction/asyncAPI/README.md +++ b/Samples/0_Introduction/asyncAPI/README.md @@ -27,7 +27,7 @@ cudaProfilerStop, cudaMalloc, cudaMemcpyAsync, cudaFree, cudaMallocHost, cudaPro ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/c++11_cuda/README.md b/Samples/0_Introduction/c++11_cuda/README.md index c9b44a58..4dd88ece 100644 --- a/Samples/0_Introduction/c++11_cuda/README.md +++ b/Samples/0_Introduction/c++11_cuda/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaMemcpy, cudaMemset, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/clock/README.md b/Samples/0_Introduction/clock/README.md index e38280f2..2ff6c773 100644 --- a/Samples/0_Introduction/clock/README.md +++ b/Samples/0_Introduction/clock/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/clock_nvrtc/README.md b/Samples/0_Introduction/clock_nvrtc/README.md index 452759f1..d0bb07a4 100644 --- a/Samples/0_Introduction/clock_nvrtc/README.md +++ b/Samples/0_Introduction/clock_nvrtc/README.md @@ -33,7 +33,7 @@ cudaBlockSize, cudaGridSize ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/concurrentKernels/README.md b/Samples/0_Introduction/concurrentKernels/README.md index ebe2ac4d..75d37a89 100644 --- a/Samples/0_Introduction/concurrentKernels/README.md +++ b/Samples/0_Introduction/concurrentKernels/README.md @@ -27,7 +27,7 @@ cudaStreamDestroy, cudaMalloc, cudaMemcpyAsync, cudaFree, cudaMallocHost, cudaEv ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/0_Introduction/cppIntegration/README.md b/Samples/0_Introduction/cppIntegration/README.md index f278cf4f..b33e82df 100644 --- a/Samples/0_Introduction/cppIntegration/README.md +++ b/Samples/0_Introduction/cppIntegration/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/cppOverload/README.md b/Samples/0_Introduction/cppOverload/README.md index 9cdeac8c..664b386e 100644 --- a/Samples/0_Introduction/cppOverload/README.md +++ b/Samples/0_Introduction/cppOverload/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFuncSetCacheConfig, cudaFree, cudaMallocHost, cudaSetDevice, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/cudaOpenMP/README.md b/Samples/0_Introduction/cudaOpenMP/README.md index 88baf775..49032142 100644 --- a/Samples/0_Introduction/cudaOpenMP/README.md +++ b/Samples/0_Introduction/cudaOpenMP/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaGetErrorString, cudaFree, cudaGetLastError, cudaSetDevice, cudaG ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/0_Introduction/fp16ScalarProduct/README.md b/Samples/0_Introduction/fp16ScalarProduct/README.md index 39f28774..aa0d4115 100644 --- a/Samples/0_Introduction/fp16ScalarProduct/README.md +++ b/Samples/0_Introduction/fp16ScalarProduct/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaFree, cudaMallocHost, cudaFreeHost, cudaMalloc, cudaGetDevicePro ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/matrixMul/README.md b/Samples/0_Introduction/matrixMul/README.md index 1c7d3f36..2cc38b13 100644 --- a/Samples/0_Introduction/matrixMul/README.md +++ b/Samples/0_Introduction/matrixMul/README.md @@ -27,7 +27,7 @@ cudaStreamCreateWithFlags, cudaProfilerStop, cudaMalloc, cudaFree, cudaMallocHos ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/matrixMulDrv/README.md b/Samples/0_Introduction/matrixMulDrv/README.md index bb9723de..f31274cc 100644 --- a/Samples/0_Introduction/matrixMulDrv/README.md +++ b/Samples/0_Introduction/matrixMulDrv/README.md @@ -27,7 +27,7 @@ cuMemcpyDtoH, cuLaunchKernel, cuMemcpyHtoD, cuDeviceGetName, cuDeviceTotalMem, c ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/0_Introduction/matrixMulDynlinkJIT/README.md b/Samples/0_Introduction/matrixMulDynlinkJIT/README.md index d8691a83..997bd616 100644 --- a/Samples/0_Introduction/matrixMulDynlinkJIT/README.md +++ b/Samples/0_Introduction/matrixMulDynlinkJIT/README.md @@ -27,7 +27,7 @@ cuMemcpyDtoH, cuDeviceGetName, cuParamSeti, cuModuleLoadDataEx, cuModuleGetFunct ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/matrixMul_nvrtc/README.md b/Samples/0_Introduction/matrixMul_nvrtc/README.md index 5b92b521..be6ce1e7 100644 --- a/Samples/0_Introduction/matrixMul_nvrtc/README.md +++ b/Samples/0_Introduction/matrixMul_nvrtc/README.md @@ -30,7 +30,7 @@ cuMemcpyDtoH, cuLaunchKernel, cuMemcpyHtoD, cuCtxSynchronize, cuMemAlloc, cuMemF ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/mergeSort/README.md b/Samples/0_Introduction/mergeSort/README.md index 0c5ac3ba..06f53c69 100644 --- a/Samples/0_Introduction/mergeSort/README.md +++ b/Samples/0_Introduction/mergeSort/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaDeviceSynchronize, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/0_Introduction/simpleAWBarrier/README.md b/Samples/0_Introduction/simpleAWBarrier/README.md index b13f6ee1..5574a0d0 100644 --- a/Samples/0_Introduction/simpleAWBarrier/README.md +++ b/Samples/0_Introduction/simpleAWBarrier/README.md @@ -30,7 +30,7 @@ cudaStreamCreateWithFlags, cudaFree, cudaDeviceGetAttribute, cudaMallocHost, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/simpleAssert/README.md b/Samples/0_Introduction/simpleAssert/README.md index 18e82da1..904479a4 100644 --- a/Samples/0_Introduction/simpleAssert/README.md +++ b/Samples/0_Introduction/simpleAssert/README.md @@ -27,7 +27,7 @@ cudaDeviceSynchronize, cudaGetErrorString ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleAssert_nvrtc/README.md b/Samples/0_Introduction/simpleAssert_nvrtc/README.md index ff7b1ed4..f3bde45f 100644 --- a/Samples/0_Introduction/simpleAssert_nvrtc/README.md +++ b/Samples/0_Introduction/simpleAssert_nvrtc/README.md @@ -30,7 +30,7 @@ cuModuleGetFunction, cuLaunchKernel, cuCtxSynchronize ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/simpleAtomicIntrinsics/README.md b/Samples/0_Introduction/simpleAtomicIntrinsics/README.md index a19b0f0a..764147f2 100644 --- a/Samples/0_Introduction/simpleAtomicIntrinsics/README.md +++ b/Samples/0_Introduction/simpleAtomicIntrinsics/README.md @@ -27,7 +27,7 @@ cudaStreamCreateWithFlags, cudaFree, cudaMallocHost, cudaFreeHost, cudaStreamSyn ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/README.md b/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/README.md index d4f1fc48..445feccc 100644 --- a/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/README.md +++ b/Samples/0_Introduction/simpleAtomicIntrinsics_nvrtc/README.md @@ -33,7 +33,7 @@ cudaBlockSize, cudaGridSize ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/simpleAttributes/README.md b/Samples/0_Introduction/simpleAttributes/README.md index bae0f405..0e2d1ecd 100644 --- a/Samples/0_Introduction/simpleAttributes/README.md +++ b/Samples/0_Introduction/simpleAttributes/README.md @@ -27,7 +27,7 @@ cudaFree, cudaMallocHost, cudaFreeHost, cudaStreamSynchronize, cudaStreamSetAttr ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleCUDA2GL/README.md b/Samples/0_Introduction/simpleCUDA2GL/README.md index d45183b3..3c72c058 100644 --- a/Samples/0_Introduction/simpleCUDA2GL/README.md +++ b/Samples/0_Introduction/simpleCUDA2GL/README.md @@ -30,7 +30,7 @@ cudaHostAlloc, cudaGraphicsUnmapResources, cudaMalloc, cudaFree, cudaGraphicsRes ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/simpleCallback/README.md b/Samples/0_Introduction/simpleCallback/README.md index f4c799e7..874b74b6 100644 --- a/Samples/0_Introduction/simpleCallback/README.md +++ b/Samples/0_Introduction/simpleCallback/README.md @@ -27,7 +27,7 @@ cudaHostAlloc, cudaStreamDestroy, cudaFree, cudaSetDevice, cudaGetDeviceCount, c ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleCooperativeGroups/README.md b/Samples/0_Introduction/simpleCooperativeGroups/README.md index 222d5fc6..d6b15dcc 100644 --- a/Samples/0_Introduction/simpleCooperativeGroups/README.md +++ b/Samples/0_Introduction/simpleCooperativeGroups/README.md @@ -27,7 +27,7 @@ cudaDeviceSynchronize, cudaGetErrorString ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleCubemapTexture/README.md b/Samples/0_Introduction/simpleCubemapTexture/README.md index be7863dc..bb8123ea 100644 --- a/Samples/0_Introduction/simpleCubemapTexture/README.md +++ b/Samples/0_Introduction/simpleCubemapTexture/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaCreateChannelDesc, cudaFreeArray, cudaFree, cudaPitchedPtr, cuda ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleDrvRuntime/README.md b/Samples/0_Introduction/simpleDrvRuntime/README.md index 31b87134..a4b9ba5e 100644 --- a/Samples/0_Introduction/simpleDrvRuntime/README.md +++ b/Samples/0_Introduction/simpleDrvRuntime/README.md @@ -30,7 +30,7 @@ cudaStreamCreateWithFlags, cudaFree, cudaMallocHost, cudaFreeHost, cudaStreamSyn ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleHyperQ/README.md b/Samples/0_Introduction/simpleHyperQ/README.md index 9381aee3..dfcad42d 100644 --- a/Samples/0_Introduction/simpleHyperQ/README.md +++ b/Samples/0_Introduction/simpleHyperQ/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaStreamDestroy, cudaMalloc, cudaFree, cudaMallocHost, cudaEventSy ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleIPC/README.md b/Samples/0_Introduction/simpleIPC/README.md index 338daf85..8d9218d4 100644 --- a/Samples/0_Introduction/simpleIPC/README.md +++ b/Samples/0_Introduction/simpleIPC/README.md @@ -30,7 +30,7 @@ cudaSetDevice, cudaIpcCloseMemHandle, cudaEventDestroy, cudaGetDeviceCount, cuda ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/simpleLayeredTexture/README.md b/Samples/0_Introduction/simpleLayeredTexture/README.md index c6312996..3034c1f8 100644 --- a/Samples/0_Introduction/simpleLayeredTexture/README.md +++ b/Samples/0_Introduction/simpleLayeredTexture/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaCreateChannelDesc, cudaFreeArray, cudaFree, cudaPitchedPtr, cuda ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleMPI/README.md b/Samples/0_Introduction/simpleMPI/README.md index f4eb9a72..4bbc7b1c 100644 --- a/Samples/0_Introduction/simpleMPI/README.md +++ b/Samples/0_Introduction/simpleMPI/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaGetLastError, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/simpleMultiCopy/README.md b/Samples/0_Introduction/simpleMultiCopy/README.md index cd882f33..868803f2 100644 --- a/Samples/0_Introduction/simpleMultiCopy/README.md +++ b/Samples/0_Introduction/simpleMultiCopy/README.md @@ -27,7 +27,7 @@ cudaHostAlloc, cudaStreamDestroy, cudaMalloc, cudaMemcpyAsync, cudaFree, cudaSet ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleMultiGPU/README.md b/Samples/0_Introduction/simpleMultiGPU/README.md index 533b342d..4da03418 100644 --- a/Samples/0_Introduction/simpleMultiGPU/README.md +++ b/Samples/0_Introduction/simpleMultiGPU/README.md @@ -27,7 +27,7 @@ cudaStreamDestroy, cudaFree, cudaMallocHost, cudaSetDevice, cudaFreeHost, cudaSt ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleOccupancy/README.md b/Samples/0_Introduction/simpleOccupancy/README.md index f090060c..c65b663d 100644 --- a/Samples/0_Introduction/simpleOccupancy/README.md +++ b/Samples/0_Introduction/simpleOccupancy/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFree, cudaDeviceSynchronize, cudaEventRecord, cudaGetDevice, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleP2P/README.md b/Samples/0_Introduction/simpleP2P/README.md index 78ae01f6..a2cab261 100644 --- a/Samples/0_Introduction/simpleP2P/README.md +++ b/Samples/0_Introduction/simpleP2P/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaMalloc, cudaFree, cudaMallocHost, cudaEventCreateWithFlags, cuda ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/simplePitchLinearTexture/README.md b/Samples/0_Introduction/simplePitchLinearTexture/README.md index 27e67e2a..bf1dcc02 100644 --- a/Samples/0_Introduction/simplePitchLinearTexture/README.md +++ b/Samples/0_Introduction/simplePitchLinearTexture/README.md @@ -27,7 +27,7 @@ cudaMallocArray, cudaFreeArray, cudaFree, cudaMallocPitch, cudaDestroyTextureObj ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simplePrintf/README.md b/Samples/0_Introduction/simplePrintf/README.md index b3552f8f..56067d0f 100644 --- a/Samples/0_Introduction/simplePrintf/README.md +++ b/Samples/0_Introduction/simplePrintf/README.md @@ -27,7 +27,7 @@ cudaGetDeviceProperties, cudaDeviceSynchronize, cudaGetDevice ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleSeparateCompilation/README.md b/Samples/0_Introduction/simpleSeparateCompilation/README.md index 73f148bc..e8ffd055 100644 --- a/Samples/0_Introduction/simpleSeparateCompilation/README.md +++ b/Samples/0_Introduction/simpleSeparateCompilation/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaMemcpyFromSymbol, cudaFree, cudaGetLastError, cudaMalloc ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleStreams/README.md b/Samples/0_Introduction/simpleStreams/README.md index a5cc0839..8591eec8 100644 --- a/Samples/0_Introduction/simpleStreams/README.md +++ b/Samples/0_Introduction/simpleStreams/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaSetDeviceFlags, cudaSetDevice, cudaEventDestroy, cudaStreamCreat ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleSurfaceWrite/README.md b/Samples/0_Introduction/simpleSurfaceWrite/README.md index 3215e902..ac1008dc 100644 --- a/Samples/0_Introduction/simpleSurfaceWrite/README.md +++ b/Samples/0_Introduction/simpleSurfaceWrite/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaCreateChannelDesc, cudaMallocArray, cudaFreeArray, cudaFree, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleTemplates/README.md b/Samples/0_Introduction/simpleTemplates/README.md index 3b6db2d1..60846a99 100644 --- a/Samples/0_Introduction/simpleTemplates/README.md +++ b/Samples/0_Introduction/simpleTemplates/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaMemcpy, cudaGetDeviceProperties, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleTemplates_nvrtc/README.md b/Samples/0_Introduction/simpleTemplates_nvrtc/README.md index 3ae52e22..988fec69 100644 --- a/Samples/0_Introduction/simpleTemplates_nvrtc/README.md +++ b/Samples/0_Introduction/simpleTemplates_nvrtc/README.md @@ -30,7 +30,7 @@ cuMemcpyDtoH, cuLaunchKernel, cuMemcpyHtoD, cuCtxSynchronize, cuMemAlloc, cuMemF ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/simpleTexture/README.md b/Samples/0_Introduction/simpleTexture/README.md index 54a6de54..e51b70ef 100644 --- a/Samples/0_Introduction/simpleTexture/README.md +++ b/Samples/0_Introduction/simpleTexture/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaCreateChannelDesc, cudaMallocArray, cudaFreeArray, cudaFree, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleTexture3D/README.md b/Samples/0_Introduction/simpleTexture3D/README.md index d1c37e11..9b33307e 100644 --- a/Samples/0_Introduction/simpleTexture3D/README.md +++ b/Samples/0_Introduction/simpleTexture3D/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaFreeArray, cudaFree, cudaPitchedPtr, ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/simpleTextureDrv/README.md b/Samples/0_Introduction/simpleTextureDrv/README.md index 2ecd5e9f..6dace0ff 100644 --- a/Samples/0_Introduction/simpleTextureDrv/README.md +++ b/Samples/0_Introduction/simpleTextureDrv/README.md @@ -27,7 +27,7 @@ cuMemcpyDtoH, cuLaunchKernel, cuModuleLoadData, cuDeviceGetName, cuDeviceGetAttr ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/0_Introduction/simpleVoteIntrinsics/README.md b/Samples/0_Introduction/simpleVoteIntrinsics/README.md index a258a693..9a4625fc 100644 --- a/Samples/0_Introduction/simpleVoteIntrinsics/README.md +++ b/Samples/0_Introduction/simpleVoteIntrinsics/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFree, cudaDeviceSynchronize, cudaMalloc, cudaGetDeviceProperties ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/simpleVoteIntrinsics_nvrtc/README.md b/Samples/0_Introduction/simpleVoteIntrinsics_nvrtc/README.md index 665de800..967d89d4 100644 --- a/Samples/0_Introduction/simpleVoteIntrinsics_nvrtc/README.md +++ b/Samples/0_Introduction/simpleVoteIntrinsics_nvrtc/README.md @@ -30,7 +30,7 @@ cuMemcpyDtoH, cuLaunchKernel, cuMemcpyHtoD, cuCtxSynchronize, cuMemAlloc, cuMemF ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/simpleZeroCopy/README.md b/Samples/0_Introduction/simpleZeroCopy/README.md index cc1e2a5b..44a5bdb4 100644 --- a/Samples/0_Introduction/simpleZeroCopy/README.md +++ b/Samples/0_Introduction/simpleZeroCopy/README.md @@ -27,7 +27,7 @@ cudaHostAlloc, cudaSetDeviceFlags, cudaHostRegister, cudaSetDevice, cudaGetDevic ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/systemWideAtomics/README.md b/Samples/0_Introduction/systemWideAtomics/README.md index 16adccf1..9074550c 100644 --- a/Samples/0_Introduction/systemWideAtomics/README.md +++ b/Samples/0_Introduction/systemWideAtomics/README.md @@ -30,7 +30,7 @@ cudaDeviceSynchronize, cudaMallocManaged, cudaGetDeviceProperties, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/0_Introduction/template/README.md b/Samples/0_Introduction/template/README.md index 7c283ec3..568c15e2 100644 --- a/Samples/0_Introduction/template/README.md +++ b/Samples/0_Introduction/template/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/vectorAdd/README.md b/Samples/0_Introduction/vectorAdd/README.md index 4586cf59..f56728cf 100644 --- a/Samples/0_Introduction/vectorAdd/README.md +++ b/Samples/0_Introduction/vectorAdd/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaGetErrorString, cudaFree, cudaGetLastError, cudaMalloc ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/vectorAddDrv/README.md b/Samples/0_Introduction/vectorAddDrv/README.md index a12cbe4e..873ad190 100644 --- a/Samples/0_Introduction/vectorAddDrv/README.md +++ b/Samples/0_Introduction/vectorAddDrv/README.md @@ -27,7 +27,7 @@ cuMemcpyDtoH, cuLaunchKernel, cuMemcpyHtoD, cuModuleLoadData, cuCtxSynchronize, ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/vectorAddMMAP/README.md b/Samples/0_Introduction/vectorAddMMAP/README.md index dc791cfd..75d321fa 100644 --- a/Samples/0_Introduction/vectorAddMMAP/README.md +++ b/Samples/0_Introduction/vectorAddMMAP/README.md @@ -27,7 +27,7 @@ cuMemcpyDtoH, cuDeviceCanAccessPeer, cuModuleGetFunction, cuMemSetAccess, cuMemR ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/0_Introduction/vectorAdd_nvrtc/README.md b/Samples/0_Introduction/vectorAdd_nvrtc/README.md index 2e315e1e..d793c388 100644 --- a/Samples/0_Introduction/vectorAdd_nvrtc/README.md +++ b/Samples/0_Introduction/vectorAdd_nvrtc/README.md @@ -33,7 +33,7 @@ cudaBlockSize, cudaGridSize ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/1_Utilities/bandwidthTest/README.md b/Samples/1_Utilities/bandwidthTest/README.md index d66d37a0..f51f6e13 100644 --- a/Samples/1_Utilities/bandwidthTest/README.md +++ b/Samples/1_Utilities/bandwidthTest/README.md @@ -27,7 +27,7 @@ cudaHostAlloc, cudaMemcpy, cudaMalloc, cudaMemcpyAsync, cudaFree, cudaGetErrorSt ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/1_Utilities/deviceQuery/README.md b/Samples/1_Utilities/deviceQuery/README.md index 0da81ee8..ee094164 100644 --- a/Samples/1_Utilities/deviceQuery/README.md +++ b/Samples/1_Utilities/deviceQuery/README.md @@ -30,7 +30,7 @@ cudaRuntimeGetVersion, cudaGetErrorString, cudaDeviceCanAccessPeer, cudaSetDevic ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/1_Utilities/deviceQueryDrv/README.md b/Samples/1_Utilities/deviceQueryDrv/README.md index 8a411a63..371091ae 100644 --- a/Samples/1_Utilities/deviceQueryDrv/README.md +++ b/Samples/1_Utilities/deviceQueryDrv/README.md @@ -30,7 +30,7 @@ cudaSetDevice ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/1_Utilities/topologyQuery/README.md b/Samples/1_Utilities/topologyQuery/README.md index 43483e3f..564d2d4f 100644 --- a/Samples/1_Utilities/topologyQuery/README.md +++ b/Samples/1_Utilities/topologyQuery/README.md @@ -27,7 +27,7 @@ cudaGetDeviceCount, cudaDeviceGetAttribute ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/README.md b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/README.md index 21839502..95421550 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/README.md +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_CrossGPU/README.md @@ -33,7 +33,7 @@ cudaMemcpy, cudaMalloc, cudaProducerPresentFrame, cudaFree, cudaGetErrorString, ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/README.md b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/README.md index f8b1fa62..9a0c78b0 100644 --- a/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/README.md +++ b/Samples/2_Concepts_and_Techniques/EGLStream_CUDA_Interop/README.md @@ -33,7 +33,7 @@ cudaProducerReadYUVFrame, cudaProducerTest, cudaProducerDeinit, cudaDeviceCreate ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/EGLSync_CUDAEvent_Interop/README.md b/Samples/2_Concepts_and_Techniques/EGLSync_CUDAEvent_Interop/README.md index d4f5f10f..af7b11b9 100644 --- a/Samples/2_Concepts_and_Techniques/EGLSync_CUDAEvent_Interop/README.md +++ b/Samples/2_Concepts_and_Techniques/EGLSync_CUDAEvent_Interop/README.md @@ -33,7 +33,7 @@ cudaMemcpy, cudaGetErrorString, cudaFree, cudaDeviceSynchronize, cudaGetValueMis ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/FunctionPointers/README.md b/Samples/2_Concepts_and_Techniques/FunctionPointers/README.md index e632f2e0..330810da 100644 --- a/Samples/2_Concepts_and_Techniques/FunctionPointers/README.md +++ b/Samples/2_Concepts_and_Techniques/FunctionPointers/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaMallocArray, cudaFreeArray, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/README.md b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/README.md index ff411aa6..2d9a3208 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/README.md +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineP/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaGetErrorString, cudaFree, cudaSetDevice, cudaGetDeviceCount, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/README.md b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/README.md index 68b78466..574f3422 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/README.md +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiInlineQ/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaGetErrorString, cudaFree, cudaSetDevice, cudaGetDeviceCount, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/README.md b/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/README.md index 406d0fe0..322059cc 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/README.md +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiP/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaGetErrorString, cudaFree, cudaSetDevice, cudaGetDeviceCount, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/README.md b/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/README.md index 91c957bb..c4c9e282 100644 --- a/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/README.md +++ b/Samples/2_Concepts_and_Techniques/MC_EstimatePiQ/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaGetErrorString, cudaFree, cudaSetDevice, cudaGetDeviceCount, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/README.md b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/README.md index 2c4a603b..f80e77b1 100644 --- a/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/README.md +++ b/Samples/2_Concepts_and_Techniques/MC_SingleAsianOptionP/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaGetErrorString, cudaFree, cudaSetDevice, cudaGetDeviceCount, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/boxFilter/README.md b/Samples/2_Concepts_and_Techniques/boxFilter/README.md index 24981864..dfdb5eb8 100644 --- a/Samples/2_Concepts_and_Techniques/boxFilter/README.md +++ b/Samples/2_Concepts_and_Techniques/boxFilter/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaCreateChannelDesc, cudaMallocArray, cudaFreeArra ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/convolutionSeparable/README.md b/Samples/2_Concepts_and_Techniques/convolutionSeparable/README.md index ec7ca787..de56e596 100644 --- a/Samples/2_Concepts_and_Techniques/convolutionSeparable/README.md +++ b/Samples/2_Concepts_and_Techniques/convolutionSeparable/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFree, cudaDeviceSynchronize, cudaMemcpyToSymbol, cudaMalloc ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/convolutionTexture/README.md b/Samples/2_Concepts_and_Techniques/convolutionTexture/README.md index 21339d95..a8bb545c 100644 --- a/Samples/2_Concepts_and_Techniques/convolutionTexture/README.md +++ b/Samples/2_Concepts_and_Techniques/convolutionTexture/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaMallocArray, cudaFreeArray, cudaFree, cudaMemcpyToArray, cudaDev ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/cuHook/README.md b/Samples/2_Concepts_and_Techniques/cuHook/README.md index 4702e3f6..2f172510 100644 --- a/Samples/2_Concepts_and_Techniques/cuHook/README.md +++ b/Samples/2_Concepts_and_Techniques/cuHook/README.md @@ -32,7 +32,7 @@ cudaDeviceReset, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/dct8x8/README.md b/Samples/2_Concepts_and_Techniques/dct8x8/README.md index 46afcc9d..9e4018b7 100644 --- a/Samples/2_Concepts_and_Techniques/dct8x8/README.md +++ b/Samples/2_Concepts_and_Techniques/dct8x8/README.md @@ -27,7 +27,7 @@ cudaMallocArray, cudaFreeArray, cudaFree, cudaMallocPitch, cudaDestroyTextureObj ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/eigenvalues/README.md b/Samples/2_Concepts_and_Techniques/eigenvalues/README.md index 75de7c33..3683964f 100644 --- a/Samples/2_Concepts_and_Techniques/eigenvalues/README.md +++ b/Samples/2_Concepts_and_Techniques/eigenvalues/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaDeviceSynchronize, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/histogram/README.md b/Samples/2_Concepts_and_Techniques/histogram/README.md index 3f707898..36826ee4 100644 --- a/Samples/2_Concepts_and_Techniques/histogram/README.md +++ b/Samples/2_Concepts_and_Techniques/histogram/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFree, cudaDeviceSynchronize, cudaMalloc, cudaGetDeviceProperties ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/imageDenoising/README.md b/Samples/2_Concepts_and_Techniques/imageDenoising/README.md index 43622fb7..af6ba1d6 100644 --- a/Samples/2_Concepts_and_Techniques/imageDenoising/README.md +++ b/Samples/2_Concepts_and_Techniques/imageDenoising/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaMallocArray, cudaFreeArray, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/inlinePTX/README.md b/Samples/2_Concepts_and_Techniques/inlinePTX/README.md index 4e74a70d..d5c00ca6 100644 --- a/Samples/2_Concepts_and_Techniques/inlinePTX/README.md +++ b/Samples/2_Concepts_and_Techniques/inlinePTX/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFree, cudaMallocHost, cudaGetLastError, cudaGridSize, cudaBlockS ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/inlinePTX_nvrtc/README.md b/Samples/2_Concepts_and_Techniques/inlinePTX_nvrtc/README.md index 97b19b5e..3537f47b 100644 --- a/Samples/2_Concepts_and_Techniques/inlinePTX_nvrtc/README.md +++ b/Samples/2_Concepts_and_Techniques/inlinePTX_nvrtc/README.md @@ -33,7 +33,7 @@ cudaBlockSize, cudaGridSize ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/interval/README.md b/Samples/2_Concepts_and_Techniques/interval/README.md index 82f98123..7675b3c7 100644 --- a/Samples/2_Concepts_and_Techniques/interval/README.md +++ b/Samples/2_Concepts_and_Techniques/interval/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFuncSetCacheConfig, cudaMalloc, cudaFree, cudaGetLastError, cuda ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/particles/README.md b/Samples/2_Concepts_and_Techniques/particles/README.md index 517cca76..4f5ce801 100644 --- a/Samples/2_Concepts_and_Techniques/particles/README.md +++ b/Samples/2_Concepts_and_Techniques/particles/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaFree, cudaGraphicsResourceGetMappedP ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/radixSortThrust/README.md b/Samples/2_Concepts_and_Techniques/radixSortThrust/README.md index 3fb8dace..e76a9051 100644 --- a/Samples/2_Concepts_and_Techniques/radixSortThrust/README.md +++ b/Samples/2_Concepts_and_Techniques/radixSortThrust/README.md @@ -27,7 +27,7 @@ cudaEventSynchronize, cudaEventRecord, cudaGetDevice, cudaEventDestroy, cudaEven ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/reduction/README.md b/Samples/2_Concepts_and_Techniques/reduction/README.md index d83daf85..d907046c 100644 --- a/Samples/2_Concepts_and_Techniques/reduction/README.md +++ b/Samples/2_Concepts_and_Techniques/reduction/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaFree, cudaSetDevice, cudaDeviceSynchronize, cudaGetDevice, cudaM ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/reductionMultiBlockCG/README.md b/Samples/2_Concepts_and_Techniques/reductionMultiBlockCG/README.md index 95381768..a5e0fb17 100644 --- a/Samples/2_Concepts_and_Techniques/reductionMultiBlockCG/README.md +++ b/Samples/2_Concepts_and_Techniques/reductionMultiBlockCG/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaFree, cudaSetDevice, cudaDeviceSynchronize, cudaLaunchCooperativ ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/scalarProd/README.md b/Samples/2_Concepts_and_Techniques/scalarProd/README.md index ce1b230e..a54a6617 100644 --- a/Samples/2_Concepts_and_Techniques/scalarProd/README.md +++ b/Samples/2_Concepts_and_Techniques/scalarProd/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaDeviceSynchronize, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/scan/README.md b/Samples/2_Concepts_and_Techniques/scan/README.md index 6cf37742..6a7660ff 100644 --- a/Samples/2_Concepts_and_Techniques/scan/README.md +++ b/Samples/2_Concepts_and_Techniques/scan/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaDeviceSynchronize, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/README.md b/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/README.md index fb45c58a..544676a4 100644 --- a/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/README.md +++ b/Samples/2_Concepts_and_Techniques/segmentationTreeThrust/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaMemGetInfo, cudaEventSynchronize, cudaEventRecord, cudaMemset, c ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/shfl_scan/README.md b/Samples/2_Concepts_and_Techniques/shfl_scan/README.md index 2dcd7849..2e80ddf3 100644 --- a/Samples/2_Concepts_and_Techniques/shfl_scan/README.md +++ b/Samples/2_Concepts_and_Techniques/shfl_scan/README.md @@ -28,7 +28,7 @@ cudaMemcpy, cudaFree, cudaMallocHost, cudaEventSynchronize, cudaEventRecord, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/sortingNetworks/README.md b/Samples/2_Concepts_and_Techniques/sortingNetworks/README.md index c0d55d57..04111d49 100644 --- a/Samples/2_Concepts_and_Techniques/sortingNetworks/README.md +++ b/Samples/2_Concepts_and_Techniques/sortingNetworks/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaDeviceSynchronize, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/streamOrderedAllocation/README.md b/Samples/2_Concepts_and_Techniques/streamOrderedAllocation/README.md index 28bf55da..53adf841 100644 --- a/Samples/2_Concepts_and_Techniques/streamOrderedAllocation/README.md +++ b/Samples/2_Concepts_and_Techniques/streamOrderedAllocation/README.md @@ -27,7 +27,7 @@ cudaDeviceGetDefaultMemPool, cudaFreeAsync, cudaStreamCreateWithFlags, cudaStrea ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/README.md b/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/README.md index 97b3b7f4..da2d5e80 100644 --- a/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/README.md +++ b/Samples/2_Concepts_and_Techniques/streamOrderedAllocationIPC/README.md @@ -30,7 +30,7 @@ cudaDeviceGetAttribute, cudaMemPoolImportFromShareableHandle, cudaSetDevice, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/streamOrderedAllocationP2P/README.md b/Samples/2_Concepts_and_Techniques/streamOrderedAllocationP2P/README.md index 00edba7e..9a8a380e 100644 --- a/Samples/2_Concepts_and_Techniques/streamOrderedAllocationP2P/README.md +++ b/Samples/2_Concepts_and_Techniques/streamOrderedAllocationP2P/README.md @@ -27,7 +27,7 @@ cudaDeviceGetDefaultMemPool, cudaFreeAsync, cudaStreamCreateWithFlags, cudaMemPo ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/threadFenceReduction/README.md b/Samples/2_Concepts_and_Techniques/threadFenceReduction/README.md index 910016d8..6f972242 100644 --- a/Samples/2_Concepts_and_Techniques/threadFenceReduction/README.md +++ b/Samples/2_Concepts_and_Techniques/threadFenceReduction/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFree, cudaDeviceSynchronize, cudaMalloc, cudaGetDeviceProperties ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/2_Concepts_and_Techniques/threadMigration/README.md b/Samples/2_Concepts_and_Techniques/threadMigration/README.md index 5d716ed6..5ff41a35 100644 --- a/Samples/2_Concepts_and_Techniques/threadMigration/README.md +++ b/Samples/2_Concepts_and_Techniques/threadMigration/README.md @@ -27,7 +27,7 @@ cuMemcpyDtoH, cuLaunchKernel, cuModuleLoadData, cuDeviceGetName, cuDeviceGet, cu ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/3_CUDA_Features/StreamPriorities/README.md b/Samples/3_CUDA_Features/StreamPriorities/README.md index 46ff019e..e17f07fa 100644 --- a/Samples/3_CUDA_Features/StreamPriorities/README.md +++ b/Samples/3_CUDA_Features/StreamPriorities/README.md @@ -28,7 +28,7 @@ cudaMemcpy, cudaStreamCreateWithPriority, cudaDeviceGetStreamPriorityRange, cuda ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/3_CUDA_Features/bf16TensorCoreGemm/README.md b/Samples/3_CUDA_Features/bf16TensorCoreGemm/README.md index fd0c4449..2ef302de 100644 --- a/Samples/3_CUDA_Features/bf16TensorCoreGemm/README.md +++ b/Samples/3_CUDA_Features/bf16TensorCoreGemm/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaFree, cudaGetErrorString, cudaGetLastError, cudaEventSynchronize ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/3_CUDA_Features/binaryPartitionCG/README.md b/Samples/3_CUDA_Features/binaryPartitionCG/README.md index a708a8e8..3190444e 100644 --- a/Samples/3_CUDA_Features/binaryPartitionCG/README.md +++ b/Samples/3_CUDA_Features/binaryPartitionCG/README.md @@ -27,7 +27,7 @@ cudaStreamCreateWithFlags, cudaFree, cudaMallocHost, cudaFreeHost, cudaStreamSyn ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/3_CUDA_Features/bindlessTexture/README.md b/Samples/3_CUDA_Features/bindlessTexture/README.md index abc00862..5938dd4f 100644 --- a/Samples/3_CUDA_Features/bindlessTexture/README.md +++ b/Samples/3_CUDA_Features/bindlessTexture/README.md @@ -28,7 +28,7 @@ cudaMemcpy, cudaGetMipmappedArrayLevel, cudaGraphicsMapResources, cudaDestroySur ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/3_CUDA_Features/cdpAdvancedQuicksort/README.md b/Samples/3_CUDA_Features/cdpAdvancedQuicksort/README.md index 3abaae21..65ab828d 100644 --- a/Samples/3_CUDA_Features/cdpAdvancedQuicksort/README.md +++ b/Samples/3_CUDA_Features/cdpAdvancedQuicksort/README.md @@ -28,7 +28,7 @@ cudaStreamCreateWithFlags, cudaMemcpy, cudaMemcpyAsync, cudaFree, cudaGetErrorSt ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/3_CUDA_Features/cdpBezierTessellation/README.md b/Samples/3_CUDA_Features/cdpBezierTessellation/README.md index 2bd28946..f062a285 100644 --- a/Samples/3_CUDA_Features/cdpBezierTessellation/README.md +++ b/Samples/3_CUDA_Features/cdpBezierTessellation/README.md @@ -28,7 +28,7 @@ cudaMemcpy, cudaFree, cudaGetDeviceCount, cudaMalloc, cudaGetDeviceProperties ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/3_CUDA_Features/cdpQuadtree/README.md b/Samples/3_CUDA_Features/cdpQuadtree/README.md index d17ece52..c0aa1a0d 100644 --- a/Samples/3_CUDA_Features/cdpQuadtree/README.md +++ b/Samples/3_CUDA_Features/cdpQuadtree/README.md @@ -28,7 +28,7 @@ cudaMemcpy, cudaFree, cudaGetLastError, cudaDeviceSetLimit, cudaMalloc, cudaGetD ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/3_CUDA_Features/cdpSimplePrint/README.md b/Samples/3_CUDA_Features/cdpSimplePrint/README.md index 1b8ef8cc..7fc2705a 100644 --- a/Samples/3_CUDA_Features/cdpSimplePrint/README.md +++ b/Samples/3_CUDA_Features/cdpSimplePrint/README.md @@ -28,7 +28,7 @@ cudaDeviceSynchronize, cudaGetLastError, cudaGetDeviceProperties, cudaDeviceSetL ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/3_CUDA_Features/cdpSimpleQuicksort/README.md b/Samples/3_CUDA_Features/cdpSimpleQuicksort/README.md index c1f9a58b..d029aeef 100644 --- a/Samples/3_CUDA_Features/cdpSimpleQuicksort/README.md +++ b/Samples/3_CUDA_Features/cdpSimpleQuicksort/README.md @@ -28,7 +28,7 @@ cudaStreamCreateWithFlags, cudaMemcpy, cudaStreamDestroy, cudaFree, cudaDeviceSy ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/3_CUDA_Features/cudaCompressibleMemory/README.md b/Samples/3_CUDA_Features/cudaCompressibleMemory/README.md index 30e5ed8e..8857619b 100644 --- a/Samples/3_CUDA_Features/cudaCompressibleMemory/README.md +++ b/Samples/3_CUDA_Features/cudaCompressibleMemory/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaEventSynchronize, cudaEventRecord, cudaEventElapsedTime, cudaOcc ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/3_CUDA_Features/cudaTensorCoreGemm/README.md b/Samples/3_CUDA_Features/cudaTensorCoreGemm/README.md index 2bc888f3..b76a5448 100644 --- a/Samples/3_CUDA_Features/cudaTensorCoreGemm/README.md +++ b/Samples/3_CUDA_Features/cudaTensorCoreGemm/README.md @@ -31,7 +31,7 @@ cudaMemcpy, cudaFree, cudaGetErrorString, cudaGetLastError, cudaEventSynchronize ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/3_CUDA_Features/dmmaTensorCoreGemm/README.md b/Samples/3_CUDA_Features/dmmaTensorCoreGemm/README.md index bb032258..fccbaa47 100644 --- a/Samples/3_CUDA_Features/dmmaTensorCoreGemm/README.md +++ b/Samples/3_CUDA_Features/dmmaTensorCoreGemm/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaFree, cudaGetErrorString, cudaGetLastError, cudaEventSynchronize ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/3_CUDA_Features/globalToShmemAsyncCopy/README.md b/Samples/3_CUDA_Features/globalToShmemAsyncCopy/README.md index 9eb8abbb..547e5be4 100644 --- a/Samples/3_CUDA_Features/globalToShmemAsyncCopy/README.md +++ b/Samples/3_CUDA_Features/globalToShmemAsyncCopy/README.md @@ -30,7 +30,7 @@ cudaStreamCreateWithFlags, cudaMalloc, cudaDeviceGetAttribute, cudaFree, cudaMal ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/3_CUDA_Features/graphMemoryFootprint/README.md b/Samples/3_CUDA_Features/graphMemoryFootprint/README.md index b41dcaf5..96511035 100644 --- a/Samples/3_CUDA_Features/graphMemoryFootprint/README.md +++ b/Samples/3_CUDA_Features/graphMemoryFootprint/README.md @@ -27,7 +27,7 @@ cudaGraphAddMemAllocNode, cudaStreamCreateWithFlags, cudaGraphInstantiate, cudaS ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/3_CUDA_Features/graphMemoryNodes/README.md b/Samples/3_CUDA_Features/graphMemoryNodes/README.md index bf6f7896..84c89620 100644 --- a/Samples/3_CUDA_Features/graphMemoryNodes/README.md +++ b/Samples/3_CUDA_Features/graphMemoryNodes/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaDeviceGetAttribute, cudaDriverGetVersion, cudaGraphLaunch, cudaE ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/3_CUDA_Features/immaTensorCoreGemm/README.md b/Samples/3_CUDA_Features/immaTensorCoreGemm/README.md index 5ddc26e5..b7e58146 100644 --- a/Samples/3_CUDA_Features/immaTensorCoreGemm/README.md +++ b/Samples/3_CUDA_Features/immaTensorCoreGemm/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFree, cudaGetErrorString, cudaGetLastError, cudaEventSynchronize ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/3_CUDA_Features/jacobiCudaGraphs/README.md b/Samples/3_CUDA_Features/jacobiCudaGraphs/README.md index 2e40f933..a2d1b8d2 100644 --- a/Samples/3_CUDA_Features/jacobiCudaGraphs/README.md +++ b/Samples/3_CUDA_Features/jacobiCudaGraphs/README.md @@ -25,7 +25,7 @@ cudaExtent, cudaGraphLaunch, cudaGraphAddMemcpyNode, cudaMallocHost, cudaPitched ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/3_CUDA_Features/memMapIPCDrv/README.md b/Samples/3_CUDA_Features/memMapIPCDrv/README.md index f46c54ac..969dd325 100644 --- a/Samples/3_CUDA_Features/memMapIPCDrv/README.md +++ b/Samples/3_CUDA_Features/memMapIPCDrv/README.md @@ -30,7 +30,7 @@ cuDeviceCanAccessPeer, cuMemImportFromShareableHandle, cuModuleLoadDataEx, cuMod ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/3_CUDA_Features/newdelete/README.md b/Samples/3_CUDA_Features/newdelete/README.md index 0ca91dd0..43e2b413 100644 --- a/Samples/3_CUDA_Features/newdelete/README.md +++ b/Samples/3_CUDA_Features/newdelete/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFree, cudaDeviceSynchronize, cudaDeviceSetLimit, cudaMalloc ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/3_CUDA_Features/ptxjit/README.md b/Samples/3_CUDA_Features/ptxjit/README.md index 49c618ff..719a6d2c 100644 --- a/Samples/3_CUDA_Features/ptxjit/README.md +++ b/Samples/3_CUDA_Features/ptxjit/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaDriverGetVersion, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/3_CUDA_Features/simpleCudaGraphs/README.md b/Samples/3_CUDA_Features/simpleCudaGraphs/README.md index 22236fea..be078627 100644 --- a/Samples/3_CUDA_Features/simpleCudaGraphs/README.md +++ b/Samples/3_CUDA_Features/simpleCudaGraphs/README.md @@ -25,7 +25,7 @@ cudaGraphClone, cudaExtent, cudaGraphLaunch, cudaStreamCreate, cudaLaunchHostFun ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/3_CUDA_Features/tf32TensorCoreGemm/README.md b/Samples/3_CUDA_Features/tf32TensorCoreGemm/README.md index 017ed53b..f5a9d6fb 100644 --- a/Samples/3_CUDA_Features/tf32TensorCoreGemm/README.md +++ b/Samples/3_CUDA_Features/tf32TensorCoreGemm/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaFree, cudaGetErrorString, cudaGetLastError, cudaEventSynchronize ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/3_CUDA_Features/warpAggregatedAtomicsCG/README.md b/Samples/3_CUDA_Features/warpAggregatedAtomicsCG/README.md index 8438858f..915fb8f8 100644 --- a/Samples/3_CUDA_Features/warpAggregatedAtomicsCG/README.md +++ b/Samples/3_CUDA_Features/warpAggregatedAtomicsCG/README.md @@ -25,7 +25,7 @@ cudaMemcpy, cudaFree, cudaDeviceGetAttribute, cudaMemset, cudaMalloc ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/README.md b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/README.md index d21737f9..345b9a7a 100644 --- a/Samples/4_CUDA_Libraries/FilterBorderControlNPP/README.md +++ b/Samples/4_CUDA_Libraries/FilterBorderControlNPP/README.md @@ -30,7 +30,7 @@ cudaRuntimeGetVersion, cudaDeviceReset, cudaSetDevice, cudaGetDeviceCount, cudaD ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/MersenneTwisterGP11213/README.md b/Samples/4_CUDA_Libraries/MersenneTwisterGP11213/README.md index 907b0169..b5c08c47 100644 --- a/Samples/4_CUDA_Libraries/MersenneTwisterGP11213/README.md +++ b/Samples/4_CUDA_Libraries/MersenneTwisterGP11213/README.md @@ -30,7 +30,7 @@ cudaStreamCreateWithFlags, cudaStreamDestroy, cudaFree, cudaMallocHost, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/4_CUDA_Libraries/batchCUBLAS/README.md b/Samples/4_CUDA_Libraries/batchCUBLAS/README.md index c9573303..f913e9d5 100644 --- a/Samples/4_CUDA_Libraries/batchCUBLAS/README.md +++ b/Samples/4_CUDA_Libraries/batchCUBLAS/README.md @@ -33,7 +33,7 @@ cudaMemcpy, cudaGetErrorString, cudaFree, cudaGetLastError, cudaDeviceSynchroniz ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/batchedLabelMarkersAndLabelCompressionNPP/README.md b/Samples/4_CUDA_Libraries/batchedLabelMarkersAndLabelCompressionNPP/README.md index b4d2969d..62706878 100644 --- a/Samples/4_CUDA_Libraries/batchedLabelMarkersAndLabelCompressionNPP/README.md +++ b/Samples/4_CUDA_Libraries/batchedLabelMarkersAndLabelCompressionNPP/README.md @@ -30,7 +30,7 @@ cudaRuntimeGetVersion, cudaMallocPitch, cudaFree, cudaDeviceGetAttribute, cudaMa ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/4_CUDA_Libraries/boxFilterNPP/README.md b/Samples/4_CUDA_Libraries/boxFilterNPP/README.md index 84f2ea24..d4a0683f 100644 --- a/Samples/4_CUDA_Libraries/boxFilterNPP/README.md +++ b/Samples/4_CUDA_Libraries/boxFilterNPP/README.md @@ -30,7 +30,7 @@ cudaRuntimeGetVersion, cudaDriverGetVersion ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/README.md b/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/README.md index 080aa680..e9e12ac4 100644 --- a/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/README.md +++ b/Samples/4_CUDA_Libraries/cannyEdgeDetectorNPP/README.md @@ -30,7 +30,7 @@ cudaRuntimeGetVersion, cudaFree, cudaSetDevice, cudaGetDeviceCount, cudaDeviceIn ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/conjugateGradient/README.md b/Samples/4_CUDA_Libraries/conjugateGradient/README.md index dd76befa..655fbd9e 100644 --- a/Samples/4_CUDA_Libraries/conjugateGradient/README.md +++ b/Samples/4_CUDA_Libraries/conjugateGradient/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaFree, cudaDeviceSynchronize, cudaMalloc, cudaGetDeviceProperties ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/conjugateGradientCudaGraphs/README.md b/Samples/4_CUDA_Libraries/conjugateGradientCudaGraphs/README.md index d3b0df6b..975abf4d 100644 --- a/Samples/4_CUDA_Libraries/conjugateGradientCudaGraphs/README.md +++ b/Samples/4_CUDA_Libraries/conjugateGradientCudaGraphs/README.md @@ -30,7 +30,7 @@ cudaGraphInstantiate, cudaStreamDestroy, cudaStreamBeginCapture, cudaFree, cudaM ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/conjugateGradientMultiBlockCG/README.md b/Samples/4_CUDA_Libraries/conjugateGradientMultiBlockCG/README.md index f1328c74..3cdd56fe 100644 --- a/Samples/4_CUDA_Libraries/conjugateGradientMultiBlockCG/README.md +++ b/Samples/4_CUDA_Libraries/conjugateGradientMultiBlockCG/README.md @@ -30,7 +30,7 @@ cudaFree, cudaMallocManaged, cudaDeviceSynchronize, cudaEventRecord, cudaLaunchC ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/README.md b/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/README.md index d13bf9eb..b162b598 100644 --- a/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/README.md +++ b/Samples/4_CUDA_Libraries/conjugateGradientMultiDeviceCG/README.md @@ -30,7 +30,7 @@ cudaHostAlloc, cudaMemPrefetchAsync, cudaFree, cudaLaunchCooperativeKernel, cuda ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/conjugateGradientPrecond/README.md b/Samples/4_CUDA_Libraries/conjugateGradientPrecond/README.md index ae502c7b..c5a203e3 100644 --- a/Samples/4_CUDA_Libraries/conjugateGradientPrecond/README.md +++ b/Samples/4_CUDA_Libraries/conjugateGradientPrecond/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaFree, cudaMemset, cudaMalloc, cudaGetDeviceProperties ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/4_CUDA_Libraries/conjugateGradientUM/README.md b/Samples/4_CUDA_Libraries/conjugateGradientUM/README.md index 5c9825dd..988655bc 100644 --- a/Samples/4_CUDA_Libraries/conjugateGradientUM/README.md +++ b/Samples/4_CUDA_Libraries/conjugateGradientUM/README.md @@ -28,7 +28,7 @@ cudaFree, cudaMallocManaged, cudaDeviceSynchronize, cudaMalloc, cudaGetDevicePro ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/cuDLAErrorReporting/README.md b/Samples/4_CUDA_Libraries/cuDLAErrorReporting/README.md index 82e0bbab..61fddcbf 100644 --- a/Samples/4_CUDA_Libraries/cuDLAErrorReporting/README.md +++ b/Samples/4_CUDA_Libraries/cuDLAErrorReporting/README.md @@ -27,7 +27,7 @@ cudaStreamCreateWithFlags, cudaStreamDestroy, cudaFree, cudaGetErrorName, cudaSe ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/cuDLAHybridMode/README.md b/Samples/4_CUDA_Libraries/cuDLAHybridMode/README.md index f90fedd9..355601a2 100644 --- a/Samples/4_CUDA_Libraries/cuDLAHybridMode/README.md +++ b/Samples/4_CUDA_Libraries/cuDLAHybridMode/README.md @@ -27,7 +27,7 @@ cudaStreamCreateWithFlags, cudaStreamDestroy, cudaFree, cudaGetErrorName, cudaSe ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/README.md b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/README.md index d3df1858..85708fdf 100644 --- a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/README.md +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/README.md @@ -27,7 +27,7 @@ cudaStreamCreateWithFlags, cudaStreamDestroy, cudaFree, cudaGetErrorName, cudaSe ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/main.cu b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/main.cu new file mode 100644 index 00000000..81e575c6 --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsHybrid/main.cu @@ -0,0 +1,898 @@ +/* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "cudla.h" +#include "cuda_runtime.h" +#include "cudlaExternalEtbl.hpp" + +#include +#include +#include +#include +#include +#include + +#define MAX_FILENAME_LEN 200 +#define RESERVED_SUFFIX_LEN 10 + +#define DPRINTF(...) 
printf(__VA_ARGS__) + +static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) { + DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name); + DPRINTF("\tsize: %lu\n", tensorDesc->size); + + DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", tensorDesc->n, tensorDesc->c, + tensorDesc->h, tensorDesc->w); + + DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat); + DPRINTF("\tdata type: %d\n", tensorDesc->dataType); + DPRINTF("\tdata category: %d\n", tensorDesc->dataCategory); + DPRINTF("\tpixel fmt: %d\n", tensorDesc->pixelFormat); + DPRINTF("\tpixel mapping: %d\n", tensorDesc->pixelMapping); + DPRINTF("\tstride[0]: %d\n", tensorDesc->stride[0]); + DPRINTF("\tstride[1]: %d\n", tensorDesc->stride[1]); + DPRINTF("\tstride[2]: %d\n", tensorDesc->stride[2]); + DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]); +} + +typedef struct { + cudlaDevHandle devHandle; + cudlaModule moduleHandle; + unsigned char* loadableData; + cudaStream_t stream; + uint32_t numInputTensors; + uint32_t numOutputTensors; + uint32_t numOutputTaskStatistics; + unsigned char** inputBuffer; + unsigned char** outputBuffer; + unsigned char** statisticsOutputBuffer; + void** inputBufferGPU; + void** outputBufferGPU; + void** outputTaskStatisticsGPU; + void **csv; + cudlaModuleTensorDescriptor* inputTensorDesc; + cudlaModuleTensorDescriptor* outputTensorDesc; + cudlaModuleTensorDescriptor* outputTaskStatisticsDesc; + uint64_t** inputBufferRegisteredPtr; + uint64_t** outputBufferRegisteredPtr; + uint64_t** outputTaskStatisticsRegisteredPtr; + uint64_t** outputStatisticsBufferRegisteredPtr; +} ResourceList; + +void cleanUp(ResourceList* resourceList); + +void cleanUp(ResourceList* resourceList) { + uint32_t ii = 0; + if (resourceList->inputTensorDesc != NULL) { + free(resourceList->inputTensorDesc); + resourceList->inputTensorDesc = NULL; + } + + if (resourceList->outputTensorDesc != NULL) { + free(resourceList->outputTensorDesc); + resourceList->outputTensorDesc = NULL; + } + + if 
(resourceList->outputTaskStatisticsDesc != NULL) { + free(resourceList->outputTaskStatisticsDesc); + resourceList->outputTaskStatisticsDesc = NULL; + } + + if (resourceList->loadableData != NULL) { + free(resourceList->loadableData); + resourceList->loadableData = NULL; + } + + if (resourceList->moduleHandle != NULL) { + cudlaModuleUnload(resourceList->moduleHandle, 0); + resourceList->moduleHandle = NULL; + } + + if (resourceList->devHandle != NULL) { + cudlaDestroyDevice(resourceList->devHandle); + resourceList->devHandle = NULL; + } + + if (resourceList->inputBufferGPU != NULL) { + for (ii = 0; ii < resourceList->numInputTensors; ii++) { + if ((resourceList->inputBufferGPU)[ii] != NULL) { + cudaFree((resourceList->inputBufferGPU)[ii]); + (resourceList->inputBufferGPU)[ii] = NULL; + } + } + free(resourceList->inputBufferGPU); + resourceList->inputBufferGPU = NULL; + } + + if (resourceList->outputBufferGPU != NULL) { + for (ii = 0; ii < resourceList->numOutputTensors; ii++) { + if ((resourceList->outputBufferGPU)[ii] != NULL) { + cudaFree((resourceList->outputBufferGPU)[ii]); + (resourceList->outputBufferGPU)[ii] = NULL; + } + } + free(resourceList->outputBufferGPU); + resourceList->outputBufferGPU = NULL; + } + + if (resourceList->outputTaskStatisticsGPU != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if ((resourceList->outputTaskStatisticsGPU)[ii] != NULL) { + cudaFree((resourceList->outputTaskStatisticsGPU)[ii]); + (resourceList->outputTaskStatisticsGPU)[ii] = NULL; + } + } + free(resourceList->outputTaskStatisticsGPU); + resourceList->outputTaskStatisticsGPU = NULL; + } + + if (resourceList->csv != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if ((resourceList->csv)[ii] != NULL) + { + free((resourceList->csv)[ii]); + (resourceList->csv)[ii] = NULL; + } + } + free(resourceList->csv); + resourceList->csv = NULL; + } + + if (resourceList->inputBuffer != NULL) { + for (ii = 0; ii < 
resourceList->numInputTensors; ii++) { + if ((resourceList->inputBuffer)[ii] != NULL) { + free((resourceList->inputBuffer)[ii]); + (resourceList->inputBuffer)[ii] = NULL; + } + } + free(resourceList->inputBuffer); + resourceList->inputBuffer = NULL; + } + + if (resourceList->outputBuffer != NULL) { + for (ii = 0; ii < resourceList->numOutputTensors; ii++) { + if ((resourceList->outputBuffer)[ii] != NULL) + { + free((resourceList->outputBuffer)[ii]); + (resourceList->outputBuffer)[ii] = NULL; + } + } + free(resourceList->outputBuffer); + resourceList->outputBuffer = NULL; + } + + if (resourceList->statisticsOutputBuffer != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if ((resourceList->statisticsOutputBuffer)[ii] != NULL) { + free((resourceList->statisticsOutputBuffer)[ii]); + (resourceList->statisticsOutputBuffer)[ii] = NULL; + } + } + free(resourceList->statisticsOutputBuffer); + resourceList->statisticsOutputBuffer = NULL; + } + + if (resourceList->stream != NULL) { + cudaStreamDestroy(resourceList->stream); + resourceList->stream = NULL; + } + + if (resourceList->inputBufferRegisteredPtr != NULL) { + free(resourceList->inputBufferRegisteredPtr); + resourceList->inputBufferRegisteredPtr = NULL; + } + + if (resourceList->outputBufferRegisteredPtr != NULL) { + free(resourceList->outputBufferRegisteredPtr); + resourceList->outputBufferRegisteredPtr = NULL; + } + + if (resourceList->outputTaskStatisticsRegisteredPtr != NULL) { + free(resourceList->outputTaskStatisticsRegisteredPtr); + resourceList->outputTaskStatisticsRegisteredPtr = NULL; + } + + if (resourceList->outputStatisticsBufferRegisteredPtr != NULL) { + free(resourceList->outputStatisticsBufferRegisteredPtr); + resourceList->outputStatisticsBufferRegisteredPtr = NULL; + } + + resourceList->numInputTensors = 0; + resourceList->numOutputTensors = 0; + resourceList->numOutputTaskStatistics = 0; +} + +int main(int argc, char** argv) { + cudlaDevHandle devHandle; + cudlaModule 
moduleHandle; + cudlaStatus err; + uint32_t statSupport = 0; + uint32_t dlaFreqInMHz = 0; + FILE* fp = NULL; + struct stat st; + size_t file_size; + size_t actually_read = 0; + unsigned char *loadableData = NULL; + char filename[MAX_FILENAME_LEN]; + const char* suffix = ".csv"; + + cudaStream_t stream; + cudaError_t result; + const char* errPtr = NULL; + + ResourceList resourceList; + + memset(&resourceList, 0x00, sizeof(ResourceList)); + + if ((argc != 4) && (argc != 5)) { + DPRINTF("Usage : ./test_cudla_layerwise_stats_L0_hybrid_test1 \n"); + return 1; + } + + if (argc == 5) { + if((strlen(argv[4])) > (MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)) + { + DPRINTF("Filename prefix length is too big, greater than maximum permissible prefix length of %u \n",(MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)); + return 1; + } + } + + // Read loadable into buffer. + fp = fopen(argv[1], "rb"); + if (fp == NULL) { + DPRINTF("Cannot open file %s\n", argv[1]); + return 1; + } + + if (stat(argv[1], &st) != 0) { + DPRINTF("Cannot stat file\n"); + return 1; + } + + file_size = st.st_size; + DPRINTF("The file size = %ld\n", file_size); + + dlaFreqInMHz = atoi(argv[2]); + statSupport = atoi(argv[3]); + + loadableData = (unsigned char *)malloc(file_size); + if (loadableData == NULL) { + DPRINTF("Cannot Allocate memory for loadable\n"); + return 1; + } + + actually_read = fread(loadableData, 1, file_size, fp); + if ( actually_read != file_size ) { + free(loadableData); + DPRINTF("Read wrong size\n"); + return 1; + } + fclose(fp); + + resourceList.loadableData = loadableData; + + // Initialize CUDA. 
+ result = cudaFree(0); + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in creating cudaFree = %s\n", errPtr); + cleanUp(&resourceList); + return 1; + } + + result = cudaSetDevice(0); + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in creating cudaSetDevice = %s\n", errPtr); + cleanUp(&resourceList); + return 1; + } + + err = cudlaCreateDevice(0, &devHandle, CUDLA_CUDA_DLA); + if (err != cudlaSuccess) { + DPRINTF("Error in cuDLA create device = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + DPRINTF("Device created successfully\n"); + resourceList.devHandle = devHandle; + + err = cudlaModuleLoadFromMemory(devHandle, loadableData, file_size, &moduleHandle, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err); + cleanUp(&resourceList); + return 1; + } else { + DPRINTF("Successfully loaded module\n"); + } + + resourceList.moduleHandle = moduleHandle; + + // Create CUDA stream. + result = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in creating cuda stream = %s\n", errPtr); + cleanUp(&resourceList); + return 1; + } + + resourceList.stream = stream; + + // Get tensor attributes. 
+ uint32_t numInputTensors = 0; + uint32_t numOutputTensors = 0; + uint32_t numOutputTaskStatistics = 0; + + cudlaModuleAttribute attribute; + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numInputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numInputTensors = attribute.numInputTensors; + DPRINTF("numInputTensors = %d\n", numInputTensors); + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TENSORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numOutputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numOutputTensors = attribute.numOutputTensors; + DPRINTF("numOutputTensors = %d\n", numOutputTensors); + + // using the same attributes to get num_output_task_statistics_tensors + attribute.numOutputTensors = 0; + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TASK_STATISTICS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numOutputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numOutputTaskStatistics = attribute.numOutputTensors; + DPRINTF("numOutputTaskStatistics = %d\n", numOutputTaskStatistics); + + if(numOutputTaskStatistics == 0) { + DPRINTF("Layerwise stats is not supported for this Loadable \n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.numInputTensors = numInputTensors; + resourceList.numOutputTensors = numOutputTensors; + resourceList.numOutputTaskStatistics = numOutputTaskStatistics; + + cudlaModuleTensorDescriptor* inputTensorDesc = + (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numInputTensors); + cudlaModuleTensorDescriptor* outputTensorDesc = + (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTensors); + + if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) { + if (inputTensorDesc != NULL) { + free(inputTensorDesc); + inputTensorDesc = 
NULL; + } + + if (outputTensorDesc != NULL) { + free(outputTensorDesc); + outputTensorDesc = NULL; + } + + cleanUp(&resourceList); + return 1; + } + + resourceList.inputTensorDesc = inputTensorDesc; + resourceList.outputTensorDesc = outputTensorDesc; + + cudlaModuleTensorDescriptor* outputTaskStatisticsDesc = + (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTaskStatistics); + if (outputTaskStatisticsDesc == NULL) { + free(outputTaskStatisticsDesc); + outputTaskStatisticsDesc = NULL; + cleanUp(&resourceList); + return 1; + } + + resourceList.outputTaskStatisticsDesc = outputTaskStatisticsDesc; + + attribute.inputTensorDesc = inputTensorDesc; + err = cudlaModuleGetAttributes(moduleHandle, + CUDLA_INPUT_TENSOR_DESCRIPTORS, + &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting input tensor descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing input tensor descriptor\n"); + printTensorDesc(inputTensorDesc); + + attribute.outputTensorDesc = outputTensorDesc; + err = cudlaModuleGetAttributes(moduleHandle, + CUDLA_OUTPUT_TENSOR_DESCRIPTORS, + &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting output tensor descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing output tensor descriptor\n"); + printTensorDesc(outputTensorDesc); + + attribute.outputTensorDesc = outputTaskStatisticsDesc; + err = cudlaModuleGetAttributes(moduleHandle, + CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS, + &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting task statistics descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + DPRINTF("Printing output task statistics descriptor size\n"); + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + DPRINTF("The size of %u descriptor is %lu\n", ii,outputTaskStatisticsDesc[ii].size); + } + + // Setup the input and output buffers which will be used as an input to CUDA. 
+    // Set up the host-side input/output buffers which will be used as an input to CUDA.
+    unsigned char** inputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numInputTensors);
+    if (inputBuffer == NULL) {
+        DPRINTF("Error in allocating memory for input buffer array\n");
+        cleanUp(&resourceList);
+        return 1;
+    }
+    memset(inputBuffer, 0x00, sizeof(unsigned char *)*numInputTensors);
+    resourceList.inputBuffer = inputBuffer;
+    for (uint32_t ii = 0; ii < numInputTensors; ii++) {
+        inputBuffer[ii] = (unsigned char* )malloc(inputTensorDesc[ii].size);
+        if (inputBuffer[ii] == NULL) {
+            DPRINTF("Error in allocating input memory\n");
+            cleanUp(&resourceList);
+            return 1;
+        }
+        memset(inputBuffer[ii], 0x01, inputTensorDesc[ii].size);
+    }
+
+    unsigned char** outputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTensors);
+    if (outputBuffer == NULL) {
+        DPRINTF("Error in allocating memory for output buffer array\n");
+        cleanUp(&resourceList);
+        return 1;
+    }
+    memset(outputBuffer, 0x00, sizeof(unsigned char *)*numOutputTensors);
+    resourceList.outputBuffer = outputBuffer;
+
+    for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+        outputBuffer[ii] = (unsigned char* )malloc(outputTensorDesc[ii].size);
+        if (outputBuffer[ii] == NULL) {
+            DPRINTF("Error in allocating output memory\n");
+            cleanUp(&resourceList);
+            return 1;
+        }
+        memset(outputBuffer[ii], 0x00, outputTensorDesc[ii].size);
+    }
+
+    unsigned char** statisticsOutputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTaskStatistics);
+    if (statisticsOutputBuffer == NULL) {
+        DPRINTF("Error in allocating memory for output buffer array\n");
+        cleanUp(&resourceList);
+        return 1;
+    }
+    memset(statisticsOutputBuffer, 0x00, sizeof(unsigned char *)*numOutputTaskStatistics);
+    resourceList.statisticsOutputBuffer = statisticsOutputBuffer;
+
+    for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+        statisticsOutputBuffer[ii] = (unsigned char* )malloc(outputTaskStatisticsDesc[ii].size);
+        // Fix: check the buffer just allocated. Previously this tested
+        // outputBuffer[ii] (copy-paste bug), so a failed allocation of
+        // statisticsOutputBuffer[ii] went undetected and the NULL pointer
+        // was memset below and copied into later.
+        if (statisticsOutputBuffer[ii] == NULL) {
+            DPRINTF("Error in allocating output memory\n");
+            cleanUp(&resourceList);
+            return 1;
+        }
+        memset(statisticsOutputBuffer[ii], 0x00, outputTaskStatisticsDesc[ii].size);
+    }
+
+    // Allocate memory on GPU.
+    void** inputBufferGPU = (void **)malloc(sizeof(void *)*numInputTensors);
+    if (inputBufferGPU == NULL) {
+        DPRINTF("Error in allocating memory for input buffer GPU array\n");
+        cleanUp(&resourceList);
+        return 1;
+    }
+    memset(inputBufferGPU, 0x00, sizeof(void *)*numInputTensors);
+    resourceList.inputBufferGPU = inputBufferGPU;
+
+    for (uint32_t ii = 0; ii < numInputTensors; ii++) {
+        result = cudaMalloc(&(inputBufferGPU[ii]), inputTensorDesc[ii].size);
+        if (result != cudaSuccess) {
+            DPRINTF("Error in allocating input memory on GPU\n");
+            cleanUp(&resourceList);
+            return 1;
+        }
+    }
+
+    void** outputBufferGPU = (void **)malloc(sizeof(void *)*numOutputTensors);
+    if (outputBufferGPU == NULL) {
+        DPRINTF("Error in allocating memory for output buffer GPU array\n");
+        cleanUp(&resourceList);
+        return 1;
+    }
+    memset(outputBufferGPU, 0x00, sizeof(void *)*numOutputTensors);
+    resourceList.outputBufferGPU = outputBufferGPU;
+
+    for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+        result = cudaMalloc(&(outputBufferGPU[ii]), outputTensorDesc[ii].size);
+        if (result != cudaSuccess) {
+            DPRINTF("Error in allocating output memory on GPU\n");
+            cleanUp(&resourceList);
+            return 1;
+        }
+    }
+
+    void** outputTaskStatisticsGPU = (void **)malloc(sizeof(void *)*numOutputTaskStatistics);
+    if (outputTaskStatisticsGPU == NULL) {
+        DPRINTF("Error in allocating memory for output task statistics GPU array\n");
+        cleanUp(&resourceList);
+        return 1;
+    }
+    memset(outputTaskStatisticsGPU, 0x00, sizeof(void *)*numOutputTaskStatistics);
+    resourceList.outputTaskStatisticsGPU = outputTaskStatisticsGPU;
+
+    for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+        result = cudaMalloc(&(outputTaskStatisticsGPU[ii]), outputTaskStatisticsDesc[ii].size);
+        if (result != cudaSuccess) {
+            DPRINTF("Error in allocating task statistics memory on GPU\n");
+            cleanUp(&resourceList);
+            return 1;
+        }
+    }
+
+    // Arrays of pointers that will receive the DLA-registered addresses of the
+    // CUDA allocations above (filled in by cudlaMemRegister below).
+    uint64_t** inputBufferRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numInputTensors);
+    uint64_t** outputBufferRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTensors);
+    uint64_t** outputTaskStatisticsRegisteredPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTaskStatistics);
+
+    if ((inputBufferRegisteredPtr == NULL) || (outputBufferRegisteredPtr == NULL) || (outputTaskStatisticsRegisteredPtr == NULL)) {
+        if (inputBufferRegisteredPtr != NULL) {
+            free(inputBufferRegisteredPtr);
+            inputBufferRegisteredPtr = NULL;
+        }
+
+        if (outputBufferRegisteredPtr != NULL) {
+            free(outputBufferRegisteredPtr);
+            outputBufferRegisteredPtr = NULL;
+        }
+
+        if (outputTaskStatisticsRegisteredPtr != NULL) {
+            free(outputTaskStatisticsRegisteredPtr);
+            outputTaskStatisticsRegisteredPtr = NULL;
+        }
+
+        cleanUp(&resourceList);
+        return 1;
+    }
+
+    resourceList.inputBufferRegisteredPtr = inputBufferRegisteredPtr;
+    resourceList.outputBufferRegisteredPtr = outputBufferRegisteredPtr;
+    resourceList.outputTaskStatisticsRegisteredPtr = outputTaskStatisticsRegisteredPtr;
+
+    // Register the CUDA-allocated buffers.
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) { + err = cudlaMemRegister(devHandle, + (uint64_t* )(inputBufferGPU[ii]), + inputTensorDesc[ii].size, + &(inputBufferRegisteredPtr[ii]), + 0); + if (err != cudlaSuccess) { + DPRINTF("Error in registering input memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + err = cudlaMemRegister(devHandle, + (uint64_t* )(outputBufferGPU[ii]), + outputTensorDesc[ii].size, + &(outputBufferRegisteredPtr[ii]), + 0); + if (err != cudlaSuccess) { + DPRINTF("Error in registering output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + err = cudlaMemRegister(devHandle, + (uint64_t* )(outputTaskStatisticsGPU[ii]), + outputTaskStatisticsDesc[ii].size, + &(outputTaskStatisticsRegisteredPtr[ii]), + CUDLA_TASK_STATISTICS); + if (err != cudlaSuccess) { + DPRINTF("Error in registering statistics output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n"); + + // Copy data from CPU buffers to GPU buffers. 
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) { + result = cudaMemcpyAsync(inputBufferGPU[ii], inputBuffer[ii], inputTensorDesc[ii].size, cudaMemcpyHostToDevice, stream); + if (result != cudaSuccess) { + DPRINTF("Error in enqueueing memcpy for input\n"); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + result = cudaMemsetAsync(outputBufferGPU[ii], 0, outputTensorDesc[ii].size, stream); + if (result != cudaSuccess) { + DPRINTF("Error in enqueueing memset for output\n"); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + result = cudaMemsetAsync(outputTaskStatisticsGPU[ii], 0, outputTaskStatisticsDesc[ii].size, stream); + if (result != cudaSuccess) { + DPRINTF("Error in enqueueing memset for statistics output\n"); + cleanUp(&resourceList); + return 1; + } + } + + uint64_t *outputStatisticsBufferRegisteredPtr[numOutputTensors + numOutputTaskStatistics] = {0}; + uint32_t index = 0; + for (; index < numOutputTensors ; index++) { + outputStatisticsBufferRegisteredPtr[index] = ((outputBufferRegisteredPtr[index])); + } + + for (uint32_t jj=0; jj < numOutputTaskStatistics ; jj++) { + outputStatisticsBufferRegisteredPtr[index++] = ((outputTaskStatisticsRegisteredPtr[jj])); + } + + // Enqueue a cuDLA task. 
+ cudlaTask task; + task.moduleHandle = moduleHandle; + task.outputTensor = (uint64_t * const*)&outputStatisticsBufferRegisteredPtr; + + if(statSupport == 1) { + task.numOutputTensors = (numOutputTensors + numOutputTaskStatistics); + DPRINTF("Layerwise profiling is requested \n"); + } else { + task.numOutputTensors = numOutputTensors; + DPRINTF("Layerwise profiling is not requested \n"); + } + + task.numInputTensors = numInputTensors; + task.inputTensor = inputBufferRegisteredPtr; + task.waitEvents = NULL; + task.signalEvents = NULL; + + err = cudlaSubmitTask(devHandle, &task, 1, stream, 0); + if (err != cudlaSuccess) { + DPRINTF("no of output tensor %u \n",(task.numOutputTensors)); + DPRINTF("Error in submitting task\n"); + cleanUp(&resourceList); + return 1; + } + DPRINTF("SUBMIT IS DONE !!!\n"); + + result = cudaStreamSynchronize(stream); + if (result != cudaSuccess) { + DPRINTF("Error in synchronizing stream = %s\n", cudaGetErrorName(result)); + cleanUp(&resourceList); + return 1; + } + + // Wait for stream operations to finish and bring output buffer to CPU. 
+    for (uint32_t ii = 0; ii < numOutputTensors; ii++) {
+        result = cudaMemcpyAsync(outputBuffer[ii], outputBufferGPU[ii],
+                                 outputTensorDesc[ii].size, cudaMemcpyDeviceToHost, stream);
+        if (result != cudaSuccess) {
+            DPRINTF("Error in bringing result back to CPU\n");
+            cleanUp(&resourceList);
+            return 1;
+        }
+    }
+
+    result = cudaStreamSynchronize(stream);
+    if (result != cudaSuccess) {
+        DPRINTF("Error in synchronizing stream\n");
+        cleanUp(&resourceList);
+        return 1;
+    }
+
+    if (statSupport == 1) {
+        // Copy statistics data to the CPU.
+        for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+            result = cudaMemcpyAsync(statisticsOutputBuffer[ii], outputTaskStatisticsGPU[ii],
+                                     outputTaskStatisticsDesc[ii].size, cudaMemcpyDeviceToHost, stream);
+            if (result != cudaSuccess) {
+                DPRINTF("Error in bringing result back to CPU\n");
+                cleanUp(&resourceList);
+                return 1;
+            }
+        }
+
+        result = cudaStreamSynchronize(stream);
+        if (result != cudaSuccess) {
+            DPRINTF("Error in synchronizing stream\n");
+            cleanUp(&resourceList);
+            return 1;
+        }
+
+        // (Removed dead code: a local `index` was computed by scanning argv[4]
+        // for its terminator but never used, and it shadowed the outer `index`;
+        // sprintf below consumes argv[4] directly.)
+
+        const cudlaExternalEtbl* etbl = NULL;
+        if (cudlaGetExternalExportTable(&etbl,0) != cudlaSuccess) {
+            DPRINTF("Error in getting export table\n");
+            cleanUp(&resourceList);
+            return 1;
+        }
+
+        void** csv = (void **)malloc(sizeof(void *)*numOutputTaskStatistics);
+        if (csv == NULL) {
+            DPRINTF("Error in allocating memory for csv stream\n");
+            cleanUp(&resourceList);
+            return 1;
+        }
+        memset(csv, 0x00, sizeof(void *)*numOutputTaskStatistics);
+        resourceList.csv = csv;
+
+        // Translate each raw statistics buffer into CSV text; optionally dump
+        // each one to "<prefix><index+1>.csv" when a filename prefix was given.
+        for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) {
+            cudlaTranslateCsvAttribute csvAttribute;
+            uint64_t csvStreamLength = 0;
+
+            err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_LENGTH,&csvAttribute);
+            // Fix: this status was previously ignored; on failure
+            // csvAttribute.csvStreamLength would be read uninitialized and
+            // handed to malloc below.
+            if (err != cudlaSuccess) {
+                DPRINTF("Error in getting csv stream length = %d\n", err);
+                cleanUp(&resourceList);
+                return 1;
+            }
+            csv[ii] = (void* )malloc(csvAttribute.csvStreamLength);
+            csvStreamLength = csvAttribute.csvStreamLength;
+            DPRINTF("size for statistics buffer %u is %lu \n",ii,csvStreamLength);
+
+            if (csv[ii] == NULL) {
+                DPRINTF("Error in allocating memory for csv stream\n");
+                cleanUp(&resourceList);
+                return 1;
+            }
+            memset(csv[ii], 0x00, csvAttribute.csvStreamLength);
+
+            csvAttribute.csvStreamStats = csv[ii];
+            err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_STATS,&csvAttribute);
+            if (err != cudlaSuccess) {
+                DPRINTF("Error in translating stats\n");
+                cleanUp(&resourceList);
+                return 1;
+            }
+
+            if (argc == 5) {
+                sprintf(filename,"%s%u%s", argv[4],(ii+1),suffix);
+                fp = fopen(filename, "w+");
+                if (fp == NULL) {
+                    DPRINTF("Cannot open file %s\n", filename);
+                    cleanUp(&resourceList);
+                    return 1;
+                }
+
+                // Fix: fwrite returns size_t; the previous uint32_t truncated
+                // the count (and %u mis-printed it) for streams >= 4 GiB.
+                size_t ret_val = fwrite(csv[ii],sizeof(char),csvStreamLength,fp);
+                if (ret_val != csvStreamLength) {
+                    DPRINTF("number of elements written to file is %zu \n", ret_val);
+                    cleanUp(&resourceList);
+                    return 1;
+                }
+                fclose(fp);
+            } else {
+                DPRINTF("%s \n",(char *)csv[ii]);
+            }
+        }
+    }
+
+    // unregister the CUDA-allocated buffers.
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) { + err = cudlaMemUnregister(devHandle, + (inputBufferRegisteredPtr[ii])); + if (err != cudlaSuccess) { + DPRINTF("Error in registering input memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + err = cudlaMemUnregister(devHandle, + (outputBufferRegisteredPtr[ii])); + if (err != cudlaSuccess) { + DPRINTF("Error in registering output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + err = cudlaMemUnregister(devHandle, + (outputTaskStatisticsRegisteredPtr[ii])); + if (err != cudlaSuccess) { + DPRINTF("Error in registering output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + DPRINTF("ALL MEMORY UNREGISTERED SUCCESSFULLY\n"); + + result = cudaStreamDestroy(stream); + if (result != cudaSuccess) { + errPtr = cudaGetErrorName(result); + DPRINTF("Error in destroying cuda stream = %s\n", errPtr); + cleanUp(&resourceList); + return 1; + } + + resourceList.stream = NULL; + + err = cudlaModuleUnload(moduleHandle, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in cudlaModuleUnload = %d\n", err); + cleanUp(&resourceList); + return 1; + } else { + DPRINTF("Successfully unloaded module\n"); + } + + resourceList.moduleHandle = NULL; + + err = cudlaDestroyDevice(devHandle); + if (err != cudlaSuccess) { + DPRINTF("Error in cuDLA destroy device = %d\n", err); + return 1; + } + DPRINTF("Device destroyed successfully\n"); + + resourceList.devHandle = NULL; + + cleanUp(&resourceList); + + DPRINTF("cuDLALayerwiseStatsHybrid DONE !!!\n"); + + return 0; +} diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/README.md b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/README.md index ec12c923..2e54fefd 100644 --- a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/README.md +++ 
b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/README.md @@ -27,7 +27,7 @@ aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/main.cpp b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/main.cpp new file mode 100644 index 00000000..639e7889 --- /dev/null +++ b/Samples/4_CUDA_Libraries/cuDLALayerwiseStatsStandalone/main.cpp @@ -0,0 +1,1348 @@ +/* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "cudla.h" +#include "nvscierror.h" +#include "nvscibuf.h" +#include "nvscisync.h" +#include "cudlaExternalEtbl.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#define MAX_FILENAME_LEN 200 +#define RESERVED_SUFFIX_LEN 10 + +#define DPRINTF(...) printf(__VA_ARGS__) + +static void printTensorDesc(cudlaModuleTensorDescriptor* tensorDesc) { + DPRINTF("\tTENSOR NAME : %s\n", tensorDesc->name); + DPRINTF("\tsize: %lu\n", tensorDesc->size); + + DPRINTF("\tdims: [%lu, %lu, %lu, %lu]\n", + tensorDesc->n, + tensorDesc->c, + tensorDesc->h, + tensorDesc->w); + + DPRINTF("\tdata fmt: %d\n", tensorDesc->dataFormat); + DPRINTF("\tdata type: %d\n", tensorDesc->dataType); + DPRINTF("\tdata category: %d\n", tensorDesc->dataCategory); + DPRINTF("\tpixel fmt: %d\n", tensorDesc->pixelFormat); + DPRINTF("\tpixel mapping: %d\n", tensorDesc->pixelMapping); + DPRINTF("\tstride[0]: %d\n", tensorDesc->stride[0]); + DPRINTF("\tstride[1]: %d\n", tensorDesc->stride[1]); + DPRINTF("\tstride[2]: %d\n", tensorDesc->stride[2]); + DPRINTF("\tstride[3]: %d\n", tensorDesc->stride[3]); +} + +typedef struct { + cudlaDevHandle devHandle; + cudlaModule moduleHandle; + unsigned char* loadableData; + uint32_t numInputTensors; + uint32_t numOutputTensors; + uint32_t numOutputTaskStatistics; + unsigned char** inputBuffer; + unsigned char** outputBuffer; + unsigned char** statisticsOutputBuffer; + cudlaModuleTensorDescriptor* 
inputTensorDesc; + cudlaModuleTensorDescriptor* outputTensorDesc; + cudlaModuleTensorDescriptor* outputTaskStatisticsDesc; + NvSciBufObj* inputBufObj; + NvSciBufObj* outputBufObj; + NvSciBufObj* statisticsBufObj; + NvSciBufModule bufModule; + NvSciBufAttrList* inputAttrList; + NvSciBufAttrList* reconciledInputAttrList; + NvSciBufAttrList* inputConflictList; + NvSciBufAttrList* outputAttrList; + NvSciBufAttrList* reconciledOutputAttrList; + NvSciBufAttrList* outputConflictList; + NvSciSyncObj syncObj; + NvSciSyncModule syncModule; + NvSciSyncCpuWaitContext nvSciCtx; + NvSciSyncAttrList waiterAttrListObj; + NvSciSyncAttrList signalerAttrListObj; + NvSciSyncAttrList nvSciSyncConflictListObj; + NvSciSyncAttrList nvSciSyncReconciledListObj; + NvSciBufAttrList* statisticsOutputAttrList; + NvSciBufAttrList* reconciledStatisticsOutputAttrList; + NvSciBufAttrList* statisticsOutputConflictList; + uint64_t** inputBufObjRegPtr; + uint64_t** outputBufObjRegPtr; + uint64_t** statisticsBufObjRegPtr; + uint64_t** devPtrs; + cudlaSignalEvents* signalEvents; + NvSciSyncFence eofFence; + void **csv; +} ResourceList; + +void cleanUp(ResourceList* resourceList); + +void cleanUp(ResourceList* resourceList) { + uint32_t ii = 0; + + if (resourceList->inputTensorDesc != NULL) { + free(resourceList->inputTensorDesc); + resourceList->inputTensorDesc = NULL; + } + if (resourceList->outputTensorDesc != NULL) { + free(resourceList->outputTensorDesc); + resourceList->outputTensorDesc = NULL; + } + + if (resourceList->outputTaskStatisticsDesc != NULL) { + free(resourceList->outputTaskStatisticsDesc); + resourceList->outputTaskStatisticsDesc = NULL; + } + + if (resourceList->loadableData != NULL) { + free(resourceList->loadableData); + resourceList->loadableData = NULL; + } + + if (resourceList->moduleHandle != NULL) { + cudlaModuleUnload(resourceList->moduleHandle, 0); + resourceList->moduleHandle = NULL; + } + + if (resourceList->devHandle != NULL) { + 
cudlaDestroyDevice(resourceList->devHandle); + resourceList->devHandle = NULL; + } + + if (resourceList->inputBufObj != NULL) { + for (ii = 0; ii < resourceList->numInputTensors; ii++) { + if((resourceList->inputBufObj)[ii] != NULL) { + NvSciBufObjFree((resourceList->inputBufObj)[ii]); + (resourceList->inputBufObj)[ii] = NULL; + } + } + } + + if (resourceList->outputBufObj != NULL) { + for (ii = 0; ii < resourceList->numOutputTensors; ii++) { + if((resourceList->outputBufObj)[ii] != NULL) { + NvSciBufObjFree((resourceList->outputBufObj)[ii]); + (resourceList->outputBufObj)[ii] = NULL; + } + } + } + + if (resourceList->statisticsBufObj != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if((resourceList->statisticsBufObj)[ii] != NULL) { + NvSciBufObjFree((resourceList->statisticsBufObj)[ii]); + (resourceList->statisticsBufObj)[ii] = NULL; + } + } + } + + if (resourceList->inputBuffer != NULL) { + for (ii = 0; ii < resourceList->numInputTensors; ii++) { + if ((resourceList->inputBuffer)[ii] != NULL) { + free((resourceList->inputBuffer)[ii]); + (resourceList->inputBuffer)[ii] = NULL; + } + } + free(resourceList->inputBuffer); + resourceList->inputBuffer = NULL; + } + + if (resourceList->outputBuffer != NULL) { + for (ii = 0; ii < resourceList->numOutputTensors; ii++) { + if ((resourceList->outputBuffer)[ii] != NULL) { + free((resourceList->outputBuffer)[ii]); + (resourceList->outputBuffer)[ii] = NULL; + } + } + free(resourceList->outputBuffer); + resourceList->outputBuffer = NULL; + } + + if (resourceList->statisticsOutputBuffer != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if ((resourceList->statisticsOutputBuffer)[ii] != NULL) { + free((resourceList->statisticsOutputBuffer)[ii]); + (resourceList->statisticsOutputBuffer)[ii] = NULL; + } + } + free(resourceList->statisticsOutputBuffer); + resourceList->statisticsOutputBuffer = NULL; + } + + if (resourceList->csv != NULL) { + for (ii = 0; ii < 
resourceList->numOutputTaskStatistics; ii++) { + if ((resourceList->csv)[ii] != NULL) { + free((resourceList->csv)[ii]); + (resourceList->csv)[ii] = NULL; + } + } + free(resourceList->csv); + resourceList->csv = NULL; + } + + if (resourceList->reconciledInputAttrList != NULL) { + for (ii = 0; ii < resourceList->numInputTensors; ii++) { + if((resourceList->reconciledInputAttrList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->reconciledInputAttrList)[ii]); + (resourceList->reconciledInputAttrList)[ii] = NULL; + } + } + free(resourceList->reconciledInputAttrList); + resourceList->reconciledInputAttrList = NULL; + } + + if (resourceList->inputConflictList != NULL) { + for (ii = 0; ii < resourceList->numInputTensors; ii++) { + if((resourceList->inputConflictList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->inputConflictList)[ii]); + (resourceList->inputConflictList)[ii] = NULL; + } + } + free(resourceList->inputConflictList); + resourceList->inputConflictList = NULL; + } + + if (resourceList->inputAttrList != NULL) { + for (ii = 0; ii < resourceList->numInputTensors; ii++) { + if((resourceList->inputAttrList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->inputAttrList)[ii]); + (resourceList->inputAttrList)[ii] = NULL; + } + } + free(resourceList->inputAttrList); + resourceList->inputAttrList = NULL; + } + + if (resourceList->reconciledOutputAttrList != NULL) { + for (ii = 0; ii < resourceList->numOutputTensors; ii++) { + if((resourceList->reconciledOutputAttrList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->reconciledOutputAttrList)[ii]); + (resourceList->reconciledOutputAttrList)[ii] = NULL; + } + } + free(resourceList->reconciledOutputAttrList); + resourceList->reconciledOutputAttrList = NULL; + } + + if (resourceList->outputConflictList != NULL) { + for (ii = 0; ii < resourceList->numOutputTensors; ii++) { + if((resourceList->outputConflictList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->outputConflictList)[ii]); + 
(resourceList->outputConflictList)[ii] = NULL; + } + } + free(resourceList->outputConflictList); + resourceList->outputConflictList = NULL; + } + + if (resourceList->outputAttrList != NULL) { + for (ii = 0; ii < resourceList->numOutputTensors; ii++) { + if((resourceList->outputAttrList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->outputAttrList)[ii]); + (resourceList->outputAttrList)[ii] = NULL; + } + } + free(resourceList->outputAttrList); + resourceList->outputAttrList = NULL; + } + + if (resourceList->reconciledStatisticsOutputAttrList != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if((resourceList->reconciledStatisticsOutputAttrList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->reconciledStatisticsOutputAttrList)[ii]); + (resourceList->reconciledStatisticsOutputAttrList)[ii] = NULL; + } + } + free(resourceList->reconciledStatisticsOutputAttrList); + resourceList->reconciledStatisticsOutputAttrList = NULL; + } + + if (resourceList->statisticsOutputConflictList != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if((resourceList->statisticsOutputConflictList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->statisticsOutputConflictList)[ii]); + (resourceList->statisticsOutputConflictList)[ii] = NULL; + } + } + free(resourceList->statisticsOutputConflictList); + resourceList->statisticsOutputConflictList = NULL; + } + + if (resourceList->statisticsOutputAttrList != NULL) { + for (ii = 0; ii < resourceList->numOutputTaskStatistics; ii++) { + if((resourceList->statisticsOutputAttrList)[ii] != NULL) { + NvSciBufAttrListFree((resourceList->statisticsOutputAttrList)[ii]); + (resourceList->statisticsOutputAttrList)[ii] = NULL; + } + } + free(resourceList->statisticsOutputAttrList); + resourceList->statisticsOutputAttrList = NULL; + } + + if (resourceList->outputBufObjRegPtr != NULL) { + free(resourceList->outputBufObjRegPtr); + resourceList->outputBufObjRegPtr = NULL; + } + + if 
(resourceList->statisticsBufObjRegPtr != NULL) { + free(resourceList->statisticsBufObjRegPtr); + resourceList->statisticsBufObjRegPtr = NULL; + } + + if (resourceList->inputBufObjRegPtr != NULL) { + free(resourceList->inputBufObjRegPtr); + resourceList->inputBufObjRegPtr = NULL; + } + + if (resourceList->bufModule != NULL) { + NvSciBufModuleClose(resourceList->bufModule); + resourceList->bufModule = NULL; + } + + NvSciSyncFenceClear(&(resourceList->eofFence)); + if (resourceList->syncObj != NULL) { + NvSciSyncObjFree(resourceList->syncObj); + resourceList->syncObj = NULL; + } + + if (resourceList->nvSciSyncConflictListObj != NULL) { + NvSciSyncAttrListFree(resourceList->nvSciSyncConflictListObj); + resourceList->nvSciSyncConflictListObj = NULL; + } + + if (resourceList->nvSciSyncReconciledListObj != NULL) { + NvSciSyncAttrListFree(resourceList->nvSciSyncReconciledListObj); + resourceList->nvSciSyncReconciledListObj = NULL; + } + + if (resourceList->signalerAttrListObj != NULL) { + NvSciSyncAttrListFree(resourceList->signalerAttrListObj); + resourceList->signalerAttrListObj = NULL; + } + + if (resourceList->waiterAttrListObj != NULL) { + NvSciSyncAttrListFree(resourceList->waiterAttrListObj); + resourceList->waiterAttrListObj = NULL; + } + + if (resourceList->nvSciCtx != NULL) { + NvSciSyncCpuWaitContextFree(resourceList->nvSciCtx); + resourceList->nvSciCtx = NULL; + } + + if (resourceList->syncModule != NULL) { + NvSciSyncModuleClose(resourceList->syncModule); + resourceList->syncModule = NULL; + } + + if (resourceList->signalEvents != NULL) { + if (resourceList->signalEvents->eofFences != NULL) { + free(resourceList->signalEvents->eofFences); + resourceList->signalEvents->eofFences = NULL; + } + free(resourceList->signalEvents); + resourceList->signalEvents = NULL; + } + + if (resourceList->devPtrs != NULL) { + free(resourceList->devPtrs); + resourceList->devPtrs = NULL; + } + + resourceList->numInputTensors = 0; + resourceList->numOutputTensors = 0; + 
resourceList->numOutputTaskStatistics = 0; +} + +cudlaStatus createAndSetAttrList(NvSciBufModule module, + uint64_t bufSize, + NvSciBufAttrList *attrList); + + +cudlaStatus createAndSetAttrList(NvSciBufModule module, + uint64_t bufSize, + NvSciBufAttrList *attrList) { + cudlaStatus status = cudlaSuccess; + NvSciError sciStatus = NvSciError_Success; + + sciStatus = NvSciBufAttrListCreate(module, attrList); + if (sciStatus != NvSciError_Success) { + status = cudlaErrorNvSci; + DPRINTF("Error in creating NvSciBuf attribute list\n"); + return status; + } + + // TODO: Refactor into multiple dimensions + bool needCpuAccess = true; + NvSciBufAttrValAccessPerm perm = NvSciBufAccessPerm_ReadWrite; + uint32_t dimcount = 1; + uint64_t sizes[] = {bufSize}; + uint32_t alignment[] = {1}; + uint32_t dataType = NvSciDataType_Int8; + NvSciBufType type = NvSciBufType_Tensor; + uint64_t baseAddrAlign = 512; + + NvSciBufAttrKeyValuePair setAttrs[] = { + {.key = NvSciBufGeneralAttrKey_Types, + .value = &type, + .len = sizeof(type)}, + {.key = NvSciBufTensorAttrKey_DataType, + .value = &dataType, + .len = sizeof(dataType)}, + {.key = NvSciBufTensorAttrKey_NumDims, + .value = &dimcount, + .len = sizeof(dimcount)}, + {.key = NvSciBufTensorAttrKey_SizePerDim, + .value = &sizes, + .len = sizeof(sizes)}, + {.key = NvSciBufTensorAttrKey_AlignmentPerDim, + .value = &alignment, + .len = sizeof(alignment)}, + {.key = NvSciBufTensorAttrKey_BaseAddrAlign, + .value = &baseAddrAlign, + .len = sizeof(baseAddrAlign)}, + {.key = NvSciBufGeneralAttrKey_RequiredPerm, + .value = &perm, + .len = sizeof(perm)}, + {.key = NvSciBufGeneralAttrKey_NeedCpuAccess, + .value = &needCpuAccess, + .len = sizeof(needCpuAccess)}}; + size_t length = sizeof(setAttrs) / sizeof(NvSciBufAttrKeyValuePair); + + sciStatus = NvSciBufAttrListSetAttrs(*attrList, setAttrs, length); + if (sciStatus != NvSciError_Success) + { + status = cudlaErrorNvSci; + DPRINTF("Error in setting NvSciBuf attribute list\n"); + return status; + } + + 
return status; +} + +NvSciError fillCpuWaiterAttrList(NvSciSyncAttrList list); + +NvSciError fillCpuWaiterAttrList(NvSciSyncAttrList list) { + bool cpuWaiter = true; + NvSciSyncAttrKeyValuePair keyValue[2]; + memset(keyValue, 0, sizeof(keyValue)); + keyValue[0].attrKey = NvSciSyncAttrKey_NeedCpuAccess; + keyValue[0].value = (void*) &cpuWaiter; + keyValue[0].len = sizeof(cpuWaiter); + NvSciSyncAccessPerm cpuPerm = NvSciSyncAccessPerm_WaitOnly; + keyValue[1].attrKey = NvSciSyncAttrKey_RequiredPerm; + keyValue[1].value = (void*) &cpuPerm; + keyValue[1].len = sizeof(cpuPerm); + return NvSciSyncAttrListSetAttrs(list, keyValue, 2); +} + +int main(int argc, char** argv) { + cudlaDevHandle devHandle; + cudlaModule moduleHandle; + cudlaStatus err; + uint32_t statSupport = 0; + uint32_t dlaFreqInMHz = 0; + FILE* fp = NULL; + struct stat st; + size_t file_size; + size_t actually_read = 0; + unsigned char *loadableData = NULL; + char filename[MAX_FILENAME_LEN]; + const char* suffix = ".csv"; + + + ResourceList resourceList; + + memset(&resourceList, 0x00, sizeof(ResourceList)); + + if ((argc != 4) && (argc != 5)) { + DPRINTF("Usage : ./test_cudla_layerwise_stats_L0_standalone_test1 \n"); + return 1; + } + + if (argc == 5) { + if((strlen(argv[4])) > (MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)) { + DPRINTF("Filename prefix length is too big, greater than maximum permissible prefix length of %u \n",(MAX_FILENAME_LEN - RESERVED_SUFFIX_LEN)); + return 1; + } + } + + // Read loadable into buffer. 
+ fp = fopen(argv[1], "rb"); + if (fp == NULL) { + DPRINTF("Cannot open file %s\n", argv[1]); + return 1; + } + + if (stat(argv[1], &st) != 0) { + DPRINTF("Cannot stat file\n"); + return 1; + } + + file_size = st.st_size; + DPRINTF("The file size = %ld\n", file_size); + + dlaFreqInMHz = atoi(argv[2]); + statSupport = atoi(argv[3]); + + loadableData = (unsigned char *)malloc(file_size); + if (loadableData == NULL) { + DPRINTF("Cannot Allocate memory for loadable\n"); + return 1; + } + + actually_read = fread(loadableData, 1, file_size, fp); + if ( actually_read != file_size ) { + free(loadableData); + DPRINTF("Read wrong size\n"); + return 1; + } + fclose(fp); + + resourceList.loadableData = loadableData; + + err = cudlaCreateDevice(0, &devHandle, CUDLA_STANDALONE); + if (err != cudlaSuccess) { + DPRINTF("Error in cuDLA create device = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + DPRINTF("Device created successfully\n"); + resourceList.devHandle = devHandle; + + err = cudlaModuleLoadFromMemory(devHandle, loadableData, file_size, &moduleHandle, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in cudlaModuleLoadFromMemory = %d\n", err); + cleanUp(&resourceList); + return 1; + } else { + DPRINTF("Successfully loaded module\n"); + } + + resourceList.moduleHandle = moduleHandle; + + // Get tensor attributes. 
+ uint32_t numInputTensors = 0; + uint32_t numOutputTensors = 0; + uint32_t numOutputTaskStatistics = 0; + + cudlaModuleAttribute attribute; + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_INPUT_TENSORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numInputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numInputTensors = attribute.numInputTensors; + DPRINTF("numInputTensors = %d\n", numInputTensors); + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TENSORS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numOutputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numOutputTensors = attribute.numOutputTensors; + DPRINTF("numOutputTensors = %d\n", numOutputTensors); + + // using the same attributes to get num_output_task_statistics_tensors + attribute.numOutputTensors = 0; + + err = cudlaModuleGetAttributes(moduleHandle, CUDLA_NUM_OUTPUT_TASK_STATISTICS, &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting numOutputTensors = %d\n", err); + cleanUp(&resourceList); + return 1; + } + numOutputTaskStatistics = attribute.numOutputTensors; + DPRINTF("numOutputTaskStatistics = %d\n", numOutputTaskStatistics); + + if(numOutputTaskStatistics == 0) { + DPRINTF("Layerwise stats is not supported for this Loadable \n"); + cleanUp(&resourceList); + return 1; + } + + resourceList.numInputTensors = numInputTensors; + resourceList.numOutputTensors = numOutputTensors; + resourceList.numOutputTaskStatistics = numOutputTaskStatistics; + + cudlaModuleTensorDescriptor* inputTensorDesc = + (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numInputTensors); + cudlaModuleTensorDescriptor* outputTensorDesc = + (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTensors); + + if ((inputTensorDesc == NULL) || (outputTensorDesc == NULL)) { + if (inputTensorDesc != NULL) + { + free(inputTensorDesc); + inputTensorDesc = 
NULL; + } + + if (outputTensorDesc != NULL) + { + free(outputTensorDesc); + outputTensorDesc = NULL; + } + + cleanUp(&resourceList); + return 1; + } + + resourceList.inputTensorDesc = inputTensorDesc; + resourceList.outputTensorDesc = outputTensorDesc; + + cudlaModuleTensorDescriptor* outputTaskStatisticsDesc = + (cudlaModuleTensorDescriptor*)malloc(sizeof(cudlaModuleTensorDescriptor)*numOutputTaskStatistics); + if (outputTaskStatisticsDesc == NULL) { + free(outputTaskStatisticsDesc); + outputTaskStatisticsDesc = NULL; + cleanUp(&resourceList); + return 1; + } + + resourceList.outputTaskStatisticsDesc = outputTaskStatisticsDesc; + + attribute.inputTensorDesc = inputTensorDesc; + err = cudlaModuleGetAttributes(moduleHandle, + CUDLA_INPUT_TENSOR_DESCRIPTORS, + &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting input tensor descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing input tensor descriptor\n"); + printTensorDesc(inputTensorDesc); + + attribute.outputTensorDesc = outputTensorDesc; + err = cudlaModuleGetAttributes(moduleHandle, + CUDLA_OUTPUT_TENSOR_DESCRIPTORS, + &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting output tensor descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing output tensor descriptor\n"); + printTensorDesc(outputTensorDesc); + + attribute.outputTensorDesc = outputTaskStatisticsDesc; + err = cudlaModuleGetAttributes(moduleHandle, + CUDLA_OUTPUT_TASK_STATISTICS_DESCRIPTORS, + &attribute); + if (err != cudlaSuccess) { + DPRINTF("Error in getting task statistics descriptor = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("Printing output task statistics descriptor size\n"); + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + DPRINTF("The size of %u descriptor is %lu\n", ii,outputTaskStatisticsDesc[ii].size); + } + + // Setup the input and output buffers. 
+ unsigned char** inputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numInputTensors); + if (inputBuffer == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(inputBuffer, 0x00, sizeof(unsigned char *)*numInputTensors); + resourceList.inputBuffer = inputBuffer; + + for (uint32_t ii = 0; ii < numInputTensors; ii++) { + inputBuffer[ii] = (unsigned char* )malloc(inputTensorDesc[ii].size); + if (inputBuffer[ii] == NULL) { + DPRINTF("Error in allocating input memory\n"); + cleanUp(&resourceList); + return 1; + } + memset(inputBuffer[ii], 0x01, inputTensorDesc[ii].size); + } + + unsigned char** outputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTensors); + if (outputBuffer == NULL) { + DPRINTF("Error in allocating memory for output buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputBuffer, 0x00, sizeof(unsigned char *)*numOutputTensors); + resourceList.outputBuffer = outputBuffer; + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + outputBuffer[ii] = (unsigned char* )malloc(outputTensorDesc[ii].size); + if (outputBuffer[ii] == NULL) { + DPRINTF("Error in allocating output memory\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputBuffer[ii], 0x00, outputTensorDesc[ii].size); + } + + unsigned char** statisticsOutputBuffer = (unsigned char **)malloc(sizeof(unsigned char *)*numOutputTaskStatistics); + if (statisticsOutputBuffer == NULL) { + DPRINTF("Error in allocating memory for output buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(statisticsOutputBuffer, 0x00, sizeof(unsigned char *)*numOutputTaskStatistics); + resourceList.statisticsOutputBuffer = statisticsOutputBuffer; + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + statisticsOutputBuffer[ii] = (unsigned char* )malloc(outputTaskStatisticsDesc[ii].size); + if (outputBuffer[ii] == NULL) { + DPRINTF("Error in allocating output 
memory\n"); + cleanUp(&resourceList); + return 1; + } + memset(statisticsOutputBuffer[ii], 0x00, outputTaskStatisticsDesc[ii].size); + } + + NvSciBufModule bufModule = NULL; + NvSciBufAttrList *inputAttrList = {NULL}; + NvSciBufAttrList *outputAttrList = {NULL}; + NvSciBufAttrList *statisticsOutputAttrList = {NULL}; + NvSciBufAttrList *reconciledInputAttrList = {NULL}; + NvSciBufAttrList *reconciledOutputAttrList = {NULL}; + NvSciBufAttrList *reconciledStatisticsOutputAttrList = {NULL}; + NvSciBufAttrList *inputConflictList = {NULL}; + NvSciBufAttrList *outputConflictList = {NULL}; + NvSciBufAttrList *statisticsOutputConflictList = {NULL}; + NvSciError sciError = NvSciError_Success; + + sciError = NvSciBufModuleOpen(&bufModule); + if (sciError != NvSciError_Success) { + DPRINTF("Error in initializing NvSciBufModule\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.bufModule = bufModule; + + // creating and setting input attribute list + + inputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors); + if (inputAttrList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(inputAttrList, 0x00, sizeof(NvSciBufAttrList)*numInputTensors); + resourceList.inputAttrList = inputAttrList; + + reconciledInputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors); + if (reconciledInputAttrList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(reconciledInputAttrList, 0x00, sizeof(NvSciBufAttrList)*numInputTensors); + resourceList.reconciledInputAttrList = reconciledInputAttrList; + + inputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numInputTensors); + if (inputConflictList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(inputConflictList, 0x00, 
sizeof(NvSciBufAttrList)*numInputTensors); + resourceList.inputConflictList = inputConflictList; + + + for (uint32_t ii = 0; ii < numInputTensors; ii++) { + err = createAndSetAttrList(bufModule, + inputTensorDesc[ii].size, + &inputAttrList[ii]); + if (err != cudlaSuccess) { + DPRINTF("Error in creating NvSciBuf attribute list for input attribute\n"); + cleanUp(&resourceList); + return 1; + } + + sciError = NvSciBufAttrListReconcile(&inputAttrList[ii], + 1, + &reconciledInputAttrList[ii], + &inputConflictList[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in reconciling NvSciBuf attribute list for input attribute\n"); + cleanUp(&resourceList); + return 1; + } + + } + + outputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors); + if (outputAttrList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors); + resourceList.outputAttrList = outputAttrList; + + reconciledOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors); + if (reconciledOutputAttrList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(reconciledOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors); + resourceList.reconciledOutputAttrList = reconciledOutputAttrList; + + outputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTensors); + if (outputConflictList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputConflictList, 0x00, sizeof(NvSciBufAttrList)*numOutputTensors); + resourceList.outputConflictList = outputConflictList; + + // creating and setting output attribute list + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + err = createAndSetAttrList(bufModule, + outputTensorDesc[ii].size, + 
&outputAttrList[ii]); + if (err != cudlaSuccess) { + DPRINTF("Error in creating NvSciBuf attribute list for output attibute\n"); + cleanUp(&resourceList); + return 1; + } + + sciError = NvSciBufAttrListReconcile(&outputAttrList[ii], + 1, + &reconciledOutputAttrList[ii], + &outputConflictList[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in reconciling NvSciBuf attribute list for output attribute\n"); + cleanUp(&resourceList); + return 1; + } + } + + statisticsOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + if (statisticsOutputAttrList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(statisticsOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + resourceList.statisticsOutputAttrList = statisticsOutputAttrList; + + reconciledStatisticsOutputAttrList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + if (reconciledStatisticsOutputAttrList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(reconciledStatisticsOutputAttrList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + resourceList.reconciledStatisticsOutputAttrList = reconciledStatisticsOutputAttrList; + + statisticsOutputConflictList = (NvSciBufAttrList *)malloc(sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + if (statisticsOutputConflictList == NULL) { + DPRINTF("Error in allocating memory for input buffer array\n"); + cleanUp(&resourceList); + return 1; + } + memset(statisticsOutputConflictList, 0x00, sizeof(NvSciBufAttrList)*numOutputTaskStatistics); + resourceList.statisticsOutputConflictList = statisticsOutputConflictList; + + // creating and setting statistics output attribute list + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + err = createAndSetAttrList(bufModule, + outputTaskStatisticsDesc[ii].size, + 
&statisticsOutputAttrList[ii]); + if (err != cudlaSuccess) { + DPRINTF("Error in creating NvSciBuf attribute list\n"); + cleanUp(&resourceList); + return 1; + } + + sciError = NvSciBufAttrListReconcile(&statisticsOutputAttrList[ii], + 1, + &reconciledStatisticsOutputAttrList[ii], + &statisticsOutputConflictList[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in reconciling NvSciBuf attribute list\n"); + cleanUp(&resourceList); + return 1; + } + } + + NvSciBufObj *inputBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numInputTensors); + NvSciBufObj *outputBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numOutputTensors); + NvSciBufObj *statisticsBufObj = (NvSciBufObj *)malloc(sizeof(NvSciBufObj)*numOutputTaskStatistics); + + resourceList.inputBufObj = inputBufObj; + resourceList.outputBufObj = outputBufObj; + resourceList.statisticsBufObj = statisticsBufObj; + + for (uint32_t ii = 0; ii < numInputTensors; ii++) { + sciError = NvSciBufObjAlloc(reconciledInputAttrList[ii], &inputBufObj[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in allocating NvSciBuf object\n"); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + sciError = NvSciBufObjAlloc(reconciledOutputAttrList[ii], &outputBufObj[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in allocating NvSciBuf object\n"); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + sciError = NvSciBufObjAlloc(reconciledStatisticsOutputAttrList[ii], &statisticsBufObj[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in allocating NvSciBuf object\n"); + cleanUp(&resourceList); + return 1; + } + } + + uint64_t** inputBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numInputTensors); + uint64_t** outputBufObjRegPtr = (uint64_t **)malloc(sizeof(uint64_t*)*numOutputTensors); + uint64_t** statisticsBufObjRegPtr = (uint64_t 
**)malloc(sizeof(uint64_t*)*numOutputTaskStatistics); + + if ((inputBufObjRegPtr == NULL) || (outputBufObjRegPtr == NULL) || (statisticsBufObjRegPtr == NULL)) { + if (inputBufObjRegPtr != NULL) { + free(inputBufObjRegPtr); + inputBufObjRegPtr = NULL; + } + + if (outputBufObjRegPtr != NULL) { + free(outputBufObjRegPtr); + outputBufObjRegPtr = NULL; + } + + if (statisticsBufObjRegPtr != NULL) { + free(statisticsBufObjRegPtr); + statisticsBufObjRegPtr = NULL; + } + + cleanUp(&resourceList); + return 1; + } + + resourceList.inputBufObjRegPtr = inputBufObjRegPtr; + resourceList.outputBufObjRegPtr = outputBufObjRegPtr; + resourceList.statisticsBufObjRegPtr = statisticsBufObjRegPtr; + + void **inputBufObjBuffer = (void **)malloc(sizeof(void*)*numInputTensors); + void **outputBufObjBuffer = (void **)malloc(sizeof(void*)*numOutputTensors); + void **statisticsBufObjBuffer = (void **)malloc(sizeof(void*)*numOutputTaskStatistics); + + cudlaExternalMemoryHandleDesc memDesc = { 0 }; + // importing external memory + for (uint32_t ii = 0; ii < numInputTensors; ii++) { + memset(&memDesc, 0, sizeof(memDesc)); + memDesc.extBufObject = (void *)inputBufObj[ii]; + memDesc.size = inputTensorDesc[ii].size; + err = cudlaImportExternalMemory(devHandle, &memDesc, &inputBufObjRegPtr[ii], 0); + if (err != cudlaSuccess) { + DPRINTF("Error in importing external memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + sciError = NvSciBufObjGetCpuPtr(inputBufObj[ii], &inputBufObjBuffer[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in getting NvSciBuf CPU pointer\n"); + cleanUp(&resourceList); + return 1; + } + memcpy(inputBufObjBuffer[ii], inputBuffer[ii], inputTensorDesc[ii].size); + } + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + memset(&memDesc, 0, sizeof(memDesc)); + memDesc.extBufObject = (void *)outputBufObj[ii]; + memDesc.size = outputTensorDesc[ii].size; + err = cudlaImportExternalMemory(devHandle, &memDesc, &outputBufObjRegPtr[ii], 0); + if (err != 
cudlaSuccess) { + DPRINTF("Error in importing external memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + sciError = NvSciBufObjGetCpuPtr(outputBufObj[ii], &outputBufObjBuffer[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in getting NvSciBuf CPU pointer\n"); + cleanUp(&resourceList); + return 1; + } + memset(outputBufObjBuffer[ii], 0, outputTensorDesc[ii].size); + } + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + memset(&memDesc, 0, sizeof(memDesc)); + memDesc.extBufObject = (void *)statisticsBufObj[ii]; + memDesc.size = outputTaskStatisticsDesc[ii].size; + err = cudlaImportExternalMemory(devHandle, &memDesc, &statisticsBufObjRegPtr[ii], CUDLA_TASK_STATISTICS); + if (err != cudlaSuccess) { + DPRINTF("Error in importing external memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + sciError = NvSciBufObjGetCpuPtr(statisticsBufObj[ii], &statisticsBufObjBuffer[ii]); + if (sciError != NvSciError_Success) { + DPRINTF("Error in getting NvSciBuf CPU pointer\n"); + cleanUp(&resourceList); + return 1; + } + memset(statisticsBufObjBuffer[ii], 0, outputTaskStatisticsDesc[ii].size); + } + + uint64_t *outputStatisticsBufferRegisteredPtr[numOutputTensors + numOutputTaskStatistics] = {0} ; + + uint32_t index = 0; + for (; index < numOutputTensors ; index++) { + outputStatisticsBufferRegisteredPtr[index] = ((outputBufObjRegPtr[index])); + } + + for (uint32_t jj=0; jj < numOutputTaskStatistics ; jj++) { + outputStatisticsBufferRegisteredPtr[index++] = ((statisticsBufObjRegPtr[jj])); + } + + NvSciSyncObj syncObj; + NvSciSyncModule syncModule; + NvSciSyncAttrList syncAttrListObj[2]; + NvSciSyncCpuWaitContext nvSciCtx; + NvSciSyncAttrList waiterAttrListObj = NULL; + NvSciSyncAttrList signalerAttrListObj = NULL; + NvSciSyncAttrList nvSciSyncConflictListObj; + NvSciSyncAttrList nvSciSyncReconciledListObj; + + sciError = NvSciSyncModuleOpen(&syncModule); + if (sciError != NvSciError_Success) { + DPRINTF("Error in 
initializing NvSciSyncModuleOpen\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.syncModule = syncModule; + + sciError = NvSciSyncCpuWaitContextAlloc(syncModule, &nvSciCtx); + if (sciError != NvSciError_Success) { + DPRINTF("Error in allocating cpu wait context NvSciSyncCpuWaitContextAlloc\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.nvSciCtx = nvSciCtx; + + sciError = NvSciSyncAttrListCreate(syncModule, &signalerAttrListObj); + if (sciError != NvSciError_Success) { + DPRINTF("Error in creating NvSciSync attribute list\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.signalerAttrListObj = signalerAttrListObj; + + sciError = NvSciSyncAttrListCreate(syncModule, &waiterAttrListObj); + if (sciError != NvSciError_Success) { + DPRINTF("Error in creating NvSciSync attribute list\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.waiterAttrListObj = waiterAttrListObj; + + err = cudlaGetNvSciSyncAttributes(reinterpret_cast(signalerAttrListObj), + CUDLA_NVSCISYNC_ATTR_SIGNAL); + if (err != cudlaSuccess) { + DPRINTF("Error in getting cuDLA's NvSciSync attributes\n"); + cleanUp(&resourceList); + return 1; + } + + sciError = fillCpuWaiterAttrList(waiterAttrListObj); + if (sciError != NvSciError_Success) { + DPRINTF("Error in setting NvSciSync attribute list\n"); + cleanUp(&resourceList); + return 1; + } + + syncAttrListObj[0] = signalerAttrListObj; + syncAttrListObj[1] = waiterAttrListObj; + sciError = NvSciSyncAttrListReconcile(syncAttrListObj, + 2, + &nvSciSyncReconciledListObj, + &nvSciSyncConflictListObj); + if (sciError != NvSciError_Success) { + DPRINTF("Error in reconciling NvSciSync's attribute lists\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.nvSciSyncConflictListObj = nvSciSyncConflictListObj; + resourceList.nvSciSyncReconciledListObj = nvSciSyncReconciledListObj; + + sciError = NvSciSyncObjAlloc(nvSciSyncReconciledListObj, &syncObj); + if (sciError != NvSciError_Success) { + DPRINTF("Error in 
allocating NvSciSync object\n"); + cleanUp(&resourceList); + return 1; + } + resourceList.syncObj = syncObj; + + // importing external semaphore + uint64_t* nvSciSyncObjRegPtr = NULL; + cudlaExternalSemaphoreHandleDesc semaMemDesc = { 0 }; + memset(&semaMemDesc, 0, sizeof(semaMemDesc)); + semaMemDesc.extSyncObject = syncObj; + err = cudlaImportExternalSemaphore(devHandle, + &semaMemDesc, + &nvSciSyncObjRegPtr, + 0); + if (err != cudlaSuccess) { + DPRINTF("Error in importing external semaphore = %d\n", err); + cleanUp(&resourceList); + return 1; + } + DPRINTF("ALL MEMORY REGISTERED SUCCESSFULLY\n"); + + // Signal Events + cudlaSignalEvents* signalEvents; + signalEvents = (cudlaSignalEvents *)malloc(sizeof(cudlaSignalEvents)); + if (signalEvents == NULL) { + DPRINTF("Error in allocating signal events\n"); + cleanUp(&resourceList); + return 1; + } + + signalEvents->numEvents = 1; + uint64_t** devPtrs = (uint64_t **)malloc(signalEvents->numEvents * + sizeof(uint64_t *)); + if (devPtrs == NULL) { + DPRINTF("Error in allocating output pointer's array of registered objects\n"); + cleanUp(&resourceList); + return 1; + } + devPtrs[0] = nvSciSyncObjRegPtr; + signalEvents->devPtrs = devPtrs; + resourceList.devPtrs = devPtrs; + + signalEvents->eofFences = (CudlaFence *)malloc(signalEvents->numEvents * + sizeof(CudlaFence)); + if (signalEvents->eofFences == NULL) { + DPRINTF("Error in allocating eofFence array\n"); + cleanUp(&resourceList); + return 1; + } + + NvSciSyncFence eofFence = NvSciSyncFenceInitializer; + signalEvents->eofFences[0].fence = &eofFence; + signalEvents->eofFences[0].type = CUDLA_NVSCISYNC_FENCE; + resourceList.signalEvents = signalEvents; + resourceList.eofFence = eofFence; + + // Enqueue a cuDLA task. 
+ cudlaTask task; + task.moduleHandle = moduleHandle; + task.outputTensor = (uint64_t * const*)&outputStatisticsBufferRegisteredPtr; + + if(statSupport == 1) { + task.numOutputTensors = (numOutputTensors + numOutputTaskStatistics); + DPRINTF("Layerwise profiling is requested \n"); + } else { + task.numOutputTensors = numOutputTensors; + DPRINTF("Layerwise profiling is not requested \n"); + } + + task.numInputTensors = numInputTensors; + task.inputTensor = inputBufObjRegPtr; + task.waitEvents = NULL; + task.signalEvents = signalEvents; + + err = cudlaSubmitTask(devHandle, &task, 1, NULL, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in submitting task\n"); + cleanUp(&resourceList); + return 1; + } + DPRINTF("SUBMIT IS DONE !!!\n"); + + // Wait for operations to finish and bring output buffer to CPU. + sciError = NvSciSyncFenceWait(reinterpret_cast(signalEvents->eofFences[0].fence), + nvSciCtx, -1); + if (sciError != NvSciError_Success) { + DPRINTF("Error in waiting on NvSciSyncFence\n"); + cleanUp(&resourceList); + return 1; + } + + // copy statistics data to cpu + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + memcpy(outputBuffer[ii], outputBufObjBuffer[ii], outputTensorDesc[ii].size); + } + + if(statSupport == 1) { + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + memcpy(statisticsOutputBuffer[ii], statisticsBufObjBuffer[ii], outputTaskStatisticsDesc[ii].size); + } + + const cudlaExternalEtbl* etbl = NULL; + if (cudlaGetExternalExportTable(&etbl,0) != cudlaSuccess) { + DPRINTF("Error in getting export table\n"); + cleanUp(&resourceList); + return 1; + } + + void** csv = (void **)malloc(sizeof(void *)*numOutputTaskStatistics); + if (csv == NULL) { + DPRINTF("Error in allocating memory for csv stream\n"); + cleanUp(&resourceList); + return 1; + } + memset(csv, 0x00, sizeof(void *)*numOutputTaskStatistics); + resourceList.csv = csv; + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + cudlaTranslateCsvAttribute csvAttribute; + 
uint64_t csvStreamLength = 0; + + err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_LENGTH,&csvAttribute); + csv[ii] = (void* )malloc(csvAttribute.csvStreamLength); + csvStreamLength = csvAttribute.csvStreamLength; + DPRINTF("size for statistics buffer %u is %lu \n",ii,csvStreamLength); + + if (csv[ii] == NULL) { + DPRINTF("Error in allocating memory for csv stream\n"); + cleanUp(&resourceList); + return 1; + } + memset(csv[ii], 0x00, csvAttribute.csvStreamLength); + + csvAttribute.csvStreamStats = csv[ii]; + err = etbl->etiTranslateStats(devHandle,statisticsOutputBuffer[ii],dlaFreqInMHz,ii,CUDLA_GET_CSV_STATS,&csvAttribute); + if (err != cudlaSuccess) { + DPRINTF("Error in translating stats\n"); + cleanUp(&resourceList); + return 1; + } + + if (argc == 5) { + sprintf(filename,"%s%u%s", argv[4],(ii+1),suffix); + fp = fopen(filename, "w+"); + if (fp == NULL) { + DPRINTF("Cannot open file %s\n", filename); + cleanUp(&resourceList); + return 1; + } + + uint32_t ret_val = fwrite(csv[ii],sizeof(char),csvStreamLength,fp); + if(ret_val != csvStreamLength) { + DPRINTF("number of elements written to file is %u \n", ret_val); + cleanUp(&resourceList); + return 1; + } + + fclose(fp); + } else { + DPRINTF("%s \n",(char *)csv[ii]); + } + } + } + + // unregister the CUDA-allocated buffers. 
+ for (uint32_t ii = 0; ii < numInputTensors; ii++) { + err = cudlaMemUnregister(devHandle, + (inputBufObjRegPtr[ii])); + if (err != cudlaSuccess) { + DPRINTF("Error in registering input memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTensors; ii++) { + err = cudlaMemUnregister(devHandle, + (outputBufObjRegPtr[ii])); + if (err != cudlaSuccess) { + DPRINTF("Error in registering output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + for (uint32_t ii = 0; ii < numOutputTaskStatistics; ii++) { + err = cudlaMemUnregister(devHandle, + (statisticsBufObjRegPtr[ii])); + if (err != cudlaSuccess) { + DPRINTF("Error in registering output memory = %d\n", err); + cleanUp(&resourceList); + return 1; + } + } + + err = cudlaMemUnregister(devHandle, nvSciSyncObjRegPtr); + if (err != cudlaSuccess) { + DPRINTF("Error in unregistering external semaphore = %d\n", err); + cleanUp(&resourceList); + return 1; + } + + DPRINTF("ALL MEMORY UNREGISTERED SUCCESSFULLY\n"); + + + err = cudlaModuleUnload(moduleHandle, 0); + if (err != cudlaSuccess) { + DPRINTF("Error in cudlaModuleUnload = %d\n", err); + cleanUp(&resourceList); + return 1; + } else { + DPRINTF("Successfully unloaded module\n"); + } + + resourceList.moduleHandle = NULL; + + err = cudlaDestroyDevice(devHandle); + if (err != cudlaSuccess) { + DPRINTF("Error in cuDLA destroy device = %d\n", err); + return 1; + } + DPRINTF("Device destroyed successfully\n"); + + resourceList.devHandle = NULL; + + cleanUp(&resourceList); + + DPRINTF("cuDLALayerwiseStatsStandalone DONE !!!\n"); + + return 0; +} diff --git a/Samples/4_CUDA_Libraries/cuDLAStandaloneMode/README.md b/Samples/4_CUDA_Libraries/cuDLAStandaloneMode/README.md index 14c03834..d2adec61 100644 --- a/Samples/4_CUDA_Libraries/cuDLAStandaloneMode/README.md +++ b/Samples/4_CUDA_Libraries/cuDLAStandaloneMode/README.md @@ -27,7 +27,7 @@ aarch64 ## Prerequisites -Download and install the [CUDA Toolkit 
12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/README.md b/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/README.md index 8f915355..c701dceb 100644 --- a/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/README.md +++ b/Samples/4_CUDA_Libraries/cuSolverDn_LinearSolver/README.md @@ -33,7 +33,7 @@ cudaMemcpy, cudaStreamDestroy, cudaFree, cudaDeviceSynchronize, cudaMemset, cuda ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/cuSolverRf/README.md b/Samples/4_CUDA_Libraries/cuSolverRf/README.md index 376f480e..be648ded 100644 --- a/Samples/4_CUDA_Libraries/cuSolverRf/README.md +++ b/Samples/4_CUDA_Libraries/cuSolverRf/README.md @@ -33,7 +33,7 @@ cudaMemcpy, cudaStreamDestroy, cudaFree, cudaDeviceSynchronize, cudaMalloc, cuda ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/README.md b/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/README.md index 8d14fb50..b698b87e 100644 --- a/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/README.md +++ b/Samples/4_CUDA_Libraries/cuSolverSp_LinearSolver/README.md @@ -33,7 +33,7 @@ cudaStreamDestroy, cudaFree, cudaDeviceSynchronize, cudaMalloc, cudaStreamCreate ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/README.md b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/README.md index 774f403a..9843b27b 100644 --- a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/README.md +++ b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelCholesky/README.md @@ -33,7 +33,7 @@ cudaMemcpy, cudaStreamDestroy, cudaFree, cudaMalloc, cudaStreamCreate ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/README.md b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/README.md index b639e77c..260d0016 100644 --- a/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/README.md +++ b/Samples/4_CUDA_Libraries/cuSolverSp_LowlevelQR/README.md @@ -33,7 +33,7 @@ cudaMemcpy, cudaStreamDestroy, cudaFree, cudaMalloc, cudaStreamCreate ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/cudaNvSci/README.md b/Samples/4_CUDA_Libraries/cudaNvSci/README.md index 81a1faf4..b8216b15 100644 --- a/Samples/4_CUDA_Libraries/cudaNvSci/README.md +++ b/Samples/4_CUDA_Libraries/cudaNvSci/README.md @@ -33,7 +33,7 @@ cudaExternalMemoryGetMappedBuffer, cudaImportExternalSemaphore, cudaDeviceGetAtt ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/cudaNvSciNvMedia/README.md b/Samples/4_CUDA_Libraries/cudaNvSciNvMedia/README.md index 4dde5eb5..7d82845a 100644 --- a/Samples/4_CUDA_Libraries/cudaNvSciNvMedia/README.md +++ b/Samples/4_CUDA_Libraries/cudaNvSciNvMedia/README.md @@ -33,7 +33,7 @@ cudaImportExternalSemaphore, cudaGetMipmappedArrayLevel, cudaSetDevice, cudaDest ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/freeImageInteropNPP/README.md b/Samples/4_CUDA_Libraries/freeImageInteropNPP/README.md index 7f457c8c..eeead3ef 100644 --- a/Samples/4_CUDA_Libraries/freeImageInteropNPP/README.md +++ b/Samples/4_CUDA_Libraries/freeImageInteropNPP/README.md @@ -30,7 +30,7 @@ cudaRuntimeGetVersion, cudaSetDevice, cudaGetDeviceCount, cudaDeviceInit, cudaDr ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/histEqualizationNPP/README.md b/Samples/4_CUDA_Libraries/histEqualizationNPP/README.md index edb5ab67..6092525e 100644 --- a/Samples/4_CUDA_Libraries/histEqualizationNPP/README.md +++ b/Samples/4_CUDA_Libraries/histEqualizationNPP/README.md @@ -30,7 +30,7 @@ cudaRuntimeGetVersion, cudaMemcpy, cudaFree, cudaSetDevice, cudaGetDeviceCount, ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/4_CUDA_Libraries/jitLto/Makefile b/Samples/4_CUDA_Libraries/jitLto/Makefile index b8a86ccc..552eb2e8 100644 --- a/Samples/4_CUDA_Libraries/jitLto/Makefile +++ b/Samples/4_CUDA_Libraries/jitLto/Makefile @@ -388,10 +388,10 @@ else @echo "Sample is ready - all dependencies have been met" endif -jitlto.o:jitlto.cpp +jitLto.o:jitLto.cpp $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $< -jitLto: jitlto.o +jitLto: jitLto.o $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) $(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE) @@ -402,7 +402,7 @@ run: build testrun: build clean: - rm -f jitLto jitlto.o + rm -f jitLto jitLto.o rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/jitLto clobber: clean diff --git a/Samples/4_CUDA_Libraries/jitLto/README.md b/Samples/4_CUDA_Libraries/jitLto/README.md index e3945ad2..d37a1e3e 100644 --- a/Samples/4_CUDA_Libraries/jitLto/README.md +++ b/Samples/4_CUDA_Libraries/jitLto/README.md @@ -30,7 +30,7 @@ cuModuleLoad, cuModuleLoadDataEx, cuModuleGetFunction, cuMemAlloc, cuMemFree, cu ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/4_CUDA_Libraries/jitLto/jitLto.cpp b/Samples/4_CUDA_Libraries/jitLto/jitLto.cpp index 6300b35b..7b6ae4a8 100644 --- a/Samples/4_CUDA_Libraries/jitLto/jitLto.cpp +++ b/Samples/4_CUDA_Libraries/jitLto/jitLto.cpp @@ -131,6 +131,16 @@ static void getLTOIR (const char *code, const char *name, int main(int argc, char *argv[]) { + unsigned int cuda_major = 0; + unsigned int cuda_minor = 0; + nvJitLinkResult res = nvJitLinkVersion(&cuda_major, &cuda_minor); + if (res != NVJITLINK_SUCCESS) { + std::cerr << "Version check failed" << '\n'; + } else { + std::cout << "CUDA " << cuda_major << "." << cuda_minor << '\n'; + } + + char *ltoIR1; char *ltoIR2; size_t ltoIR1Size; diff --git a/Samples/4_CUDA_Libraries/jitLto/jitLto_vs2017.vcxproj b/Samples/4_CUDA_Libraries/jitLto/jitLto_vs2017.vcxproj index 9d789427..ed787b46 100644 --- a/Samples/4_CUDA_Libraries/jitLto/jitLto_vs2017.vcxproj +++ b/Samples/4_CUDA_Libraries/jitLto/jitLto_vs2017.vcxproj @@ -102,7 +102,7 @@ - + diff --git a/Samples/4_CUDA_Libraries/jitLto/jitLto_vs2019.vcxproj b/Samples/4_CUDA_Libraries/jitLto/jitLto_vs2019.vcxproj index 8fd78ec9..a8dedf58 100644 --- a/Samples/4_CUDA_Libraries/jitLto/jitLto_vs2019.vcxproj +++ b/Samples/4_CUDA_Libraries/jitLto/jitLto_vs2019.vcxproj @@ -98,7 +98,7 @@ - + diff --git a/Samples/4_CUDA_Libraries/jitLto/jitLto_vs2022.vcxproj b/Samples/4_CUDA_Libraries/jitLto/jitLto_vs2022.vcxproj index 61b1d40c..93c0fa9e 100644 --- a/Samples/4_CUDA_Libraries/jitLto/jitLto_vs2022.vcxproj +++ b/Samples/4_CUDA_Libraries/jitLto/jitLto_vs2022.vcxproj @@ -98,7 +98,7 @@ - + diff --git a/Samples/4_CUDA_Libraries/jitLto/jitlto.cpp b/Samples/4_CUDA_Libraries/jitLto/jitlto.cpp deleted file mode 100644 index 7b6ae4a8..00000000 --- a/Samples/4_CUDA_Libraries/jitLto/jitlto.cpp +++ /dev/null @@ -1,261 +0,0 @@ -/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include -#include -#include -#include -#include - -#define NUM_THREADS 128 -#define NUM_BLOCKS 32 - -#define NVRTC_SAFE_CALL(x) \ - do { \ - nvrtcResult result = x; \ - if (result != NVRTC_SUCCESS) { \ - std::cerr << "\nerror: " #x " failed with error " \ - << nvrtcGetErrorString(result) << '\n'; \ - exit(1); \ - } \ - } while(0) -#define CUDA_SAFE_CALL(x) \ - do { \ - CUresult result = x; \ - if (result != CUDA_SUCCESS) { \ - const char *msg; \ - cuGetErrorName(result, &msg); \ - std::cerr << "\nerror: " #x " failed with error " \ - << msg << '\n'; \ - exit(1); \ - } \ - } while(0) -#define NVJITLINK_SAFE_CALL(h,x) \ - do { \ - nvJitLinkResult result = x; \ - if (result != NVJITLINK_SUCCESS) { \ - std::cerr << "\nerror: " #x " failed with error " \ - << result << '\n'; \ - size_t lsize; \ - result = nvJitLinkGetErrorLogSize(h, &lsize); \ - if (result == NVJITLINK_SUCCESS && lsize > 0) { \ - char *log = (char*)malloc(lsize); \ - result = nvJitLinkGetErrorLog(h, log); \ - if (result == NVJITLINK_SUCCESS) { \ - std::cerr << "error log: " << log << '\n'; \ - free(log); \ - } \ - } \ - exit(1); \ - } \ - } while(0) - -const char *lto_saxpy = " \n\ -extern __device__ float compute(float a, float x, float y); \n\ - \n\ -extern \"C\" __global__ \n\ -void saxpy(float a, float *x, float *y, float *out, size_t n) \n\ -{ \n\ - size_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n\ - if (tid < n) { \n\ - out[tid] = compute(a, x[tid], y[tid]); \n\ - } \n\ -} \n"; - -const char *lto_compute = " \n\ -__device__ float compute(float a, float x, float y) { \n\ - return a * x + y; \n\ -} \n"; - -// compile code into LTOIR, returning the IR and its size -static void getLTOIR (const char *code, const char *name, - char **ltoIR, size_t *ltoIRSize) -{ - // Create an instance of nvrtcProgram with the code string. 
- nvrtcProgram prog; - NVRTC_SAFE_CALL( - nvrtcCreateProgram(&prog, // prog - code, // buffer - name, // name - 0, // numHeaders - NULL, // headers - NULL)); // includeNames - - // specify that LTO IR should be generated for LTO operation - const char *opts[] = {"-dlto", - "--relocatable-device-code=true"}; - nvrtcResult compileResult = nvrtcCompileProgram(prog, // prog - 2, // numOptions - opts); // options - // Obtain compilation log from the program. - size_t logSize; - NVRTC_SAFE_CALL(nvrtcGetProgramLogSize(prog, &logSize)); - char *log = new char[logSize]; - NVRTC_SAFE_CALL(nvrtcGetProgramLog(prog, log)); - std::cout << log << '\n'; - delete[] log; - if (compileResult != NVRTC_SUCCESS) { - exit(1); - } - // Obtain generated LTO IR from the program. - NVRTC_SAFE_CALL(nvrtcGetLTOIRSize(prog, ltoIRSize)); - *ltoIR = new char[*ltoIRSize]; - NVRTC_SAFE_CALL(nvrtcGetLTOIR(prog, *ltoIR)); - // Destroy the program. - NVRTC_SAFE_CALL(nvrtcDestroyProgram(&prog)); -} - -int main(int argc, char *argv[]) -{ - unsigned int cuda_major = 0; - unsigned int cuda_minor = 0; - nvJitLinkResult res = nvJitLinkVersion(&cuda_major, &cuda_minor); - if (res != NVJITLINK_SUCCESS) { - std::cerr << "Version check failed" << '\n'; - } else { - std::cout << "CUDA " << cuda_major << "." << cuda_minor << '\n'; - } - - - char *ltoIR1; - char *ltoIR2; - size_t ltoIR1Size; - size_t ltoIR2Size; - // getLTOIR uses nvrtc to get the LTOIR. - // We could also use nvcc offline with -dlto -fatbin - // to generate the IR, but using nvrtc keeps the build simpler. 
- getLTOIR(lto_saxpy, "lto_saxpy.cu", <oIR1, <oIR1Size); - getLTOIR(lto_compute, "lto_compute.cu", <oIR2, <oIR2Size); - - CUdevice cuDevice; - CUcontext context; - CUmodule module; - CUfunction kernel; - CUDA_SAFE_CALL(cuInit(0)); - CUDA_SAFE_CALL(cuDeviceGet(&cuDevice, 0)); - CUDA_SAFE_CALL(cuCtxCreate(&context, 0, cuDevice)); - - // Dynamically determine the arch to link for - int major = 0; - int minor = 0; - CUDA_SAFE_CALL(cuDeviceGetAttribute(&major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); - CUDA_SAFE_CALL(cuDeviceGetAttribute(&minor, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)); - int arch = major*10 + minor; - char smbuf[16]; - memset(smbuf,0,16); - sprintf(smbuf, "-arch=sm_%d", arch); - - // Load the generated LTO IR and link them together - nvJitLinkHandle handle; - const char *lopts[] = {"-lto", smbuf}; - NVJITLINK_SAFE_CALL(handle, nvJitLinkCreate(&handle, 2, lopts)); - - NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, - (void *)ltoIR1, ltoIR1Size, "lto_saxpy")); - NVJITLINK_SAFE_CALL(handle, nvJitLinkAddData(handle, NVJITLINK_INPUT_LTOIR, - (void *)ltoIR2, ltoIR2Size, "lto_compute")); - - // The call to nvJitLinkComplete causes linker to link together the two - // LTO IR modules, do optimization on the linked LTO IR, - // and generate cubin from it. 
- NVJITLINK_SAFE_CALL(handle, nvJitLinkComplete(handle)); - - // check error log - size_t logSize; - NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLogSize(handle, &logSize)); - if (logSize > 0) { - char *log = (char*)malloc(logSize+1); - NVJITLINK_SAFE_CALL(handle, nvJitLinkGetErrorLog(handle, log)); - std::cout << "Error log: " << log << std::endl; - free(log); - } - - // get linked cubin - size_t cubinSize; - NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubinSize(handle, &cubinSize)); - void *cubin = malloc(cubinSize); - NVJITLINK_SAFE_CALL(handle, nvJitLinkGetLinkedCubin(handle, cubin)); - - NVJITLINK_SAFE_CALL(handle, nvJitLinkDestroy(&handle)); - delete[] ltoIR1; - delete[] ltoIR2; - - // cubin is linked, so now load it - CUDA_SAFE_CALL(cuModuleLoadData(&module, cubin)); - CUDA_SAFE_CALL(cuModuleGetFunction(&kernel, module, "saxpy")); - - // Generate input for execution, and create output buffers. - size_t n = NUM_THREADS * NUM_BLOCKS; - size_t bufferSize = n * sizeof(float); - float a = 5.1f; - float *hX = new float[n], *hY = new float[n], *hOut = new float[n]; - for (size_t i = 0; i < n; ++i) { - hX[i] = static_cast(i); - hY[i] = static_cast(i * 2); - } - CUdeviceptr dX, dY, dOut; - CUDA_SAFE_CALL(cuMemAlloc(&dX, bufferSize)); - CUDA_SAFE_CALL(cuMemAlloc(&dY, bufferSize)); - CUDA_SAFE_CALL(cuMemAlloc(&dOut, bufferSize)); - CUDA_SAFE_CALL(cuMemcpyHtoD(dX, hX, bufferSize)); - CUDA_SAFE_CALL(cuMemcpyHtoD(dY, hY, bufferSize)); - // Execute SAXPY. - void *args[] = { &a, &dX, &dY, &dOut, &n }; - CUDA_SAFE_CALL( - cuLaunchKernel(kernel, - NUM_BLOCKS, 1, 1, // grid dim - NUM_THREADS, 1, 1, // block dim - 0, NULL, // shared mem and stream - args, 0)); // arguments - CUDA_SAFE_CALL(cuCtxSynchronize()); - // Retrieve and print output. 
- CUDA_SAFE_CALL(cuMemcpyDtoH(hOut, dOut, bufferSize)); - - for (size_t i = 0; i < n; ++i) { - std::cout << a << " * " << hX[i] << " + " << hY[i] - << " = " << hOut[i] << '\n'; - } - // check last value to verify - if (hOut[n-1] == 29074.5) { - std::cout << "PASSED!\n"; - } else { - std::cout << "values not expected?\n"; - } - // Release resources. - CUDA_SAFE_CALL(cuMemFree(dX)); - CUDA_SAFE_CALL(cuMemFree(dY)); - CUDA_SAFE_CALL(cuMemFree(dOut)); - CUDA_SAFE_CALL(cuModuleUnload(module)); - CUDA_SAFE_CALL(cuCtxDestroy(context)); - free(cubin); - delete[] hX; - delete[] hY; - delete[] hOut; - return 0; -} diff --git a/Samples/4_CUDA_Libraries/lineOfSight/README.md b/Samples/4_CUDA_Libraries/lineOfSight/README.md index d33d5f0b..a6d0bb6d 100644 --- a/Samples/4_CUDA_Libraries/lineOfSight/README.md +++ b/Samples/4_CUDA_Libraries/lineOfSight/README.md @@ -27,7 +27,7 @@ cudaCreateChannelDesc, cudaMallocArray, cudaFreeArray, cudaDeviceSynchronize, cu ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/matrixMulCUBLAS/README.md b/Samples/4_CUDA_Libraries/matrixMulCUBLAS/README.md index d28bc127..b0500b96 100644 --- a/Samples/4_CUDA_Libraries/matrixMulCUBLAS/README.md +++ b/Samples/4_CUDA_Libraries/matrixMulCUBLAS/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaFree, cudaEventSynchronize, cudaEventRecord, cudaMalloc, cudaEve ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/4_CUDA_Libraries/nvJPEG/README.md b/Samples/4_CUDA_Libraries/nvJPEG/README.md index 967febe7..4797ddb8 100644 --- a/Samples/4_CUDA_Libraries/nvJPEG/README.md +++ b/Samples/4_CUDA_Libraries/nvJPEG/README.md @@ -28,7 +28,7 @@ cudaHostAlloc, cudaStreamCreateWithFlags, cudaStreamDestroy, cudaFree, cudaEvent ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/nvJPEG_encoder/README.md b/Samples/4_CUDA_Libraries/nvJPEG_encoder/README.md index fb6fa437..7e5f93fc 100644 --- a/Samples/4_CUDA_Libraries/nvJPEG_encoder/README.md +++ b/Samples/4_CUDA_Libraries/nvJPEG_encoder/README.md @@ -28,7 +28,7 @@ cudaFree, cudaGetErrorString, cudaEventSynchronize, cudaDeviceSynchronize, cudaE ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/oceanFFT/README.md b/Samples/4_CUDA_Libraries/oceanFFT/README.md index 19b898ee..ea2c0f06 100644 --- a/Samples/4_CUDA_Libraries/oceanFFT/README.md +++ b/Samples/4_CUDA_Libraries/oceanFFT/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaMalloc, cudaFree, cudaGraphicsResour ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/randomFog/README.md b/Samples/4_CUDA_Libraries/randomFog/README.md index ed5db2f7..259f27f5 100644 --- a/Samples/4_CUDA_Libraries/randomFog/README.md +++ b/Samples/4_CUDA_Libraries/randomFog/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaGetErrorString, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/simpleCUBLAS/README.md b/Samples/4_CUDA_Libraries/simpleCUBLAS/README.md index 7a2c6b1c..874cd07f 100644 --- a/Samples/4_CUDA_Libraries/simpleCUBLAS/README.md +++ b/Samples/4_CUDA_Libraries/simpleCUBLAS/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/4_CUDA_Libraries/simpleCUBLASXT/README.md b/Samples/4_CUDA_Libraries/simpleCUBLASXT/README.md index 5879c6ca..b670a706 100644 --- a/Samples/4_CUDA_Libraries/simpleCUBLASXT/README.md +++ b/Samples/4_CUDA_Libraries/simpleCUBLASXT/README.md @@ -30,7 +30,7 @@ cudaGetDeviceProperties, cudaGetDeviceCount, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/simpleCUBLAS_LU/README.md b/Samples/4_CUDA_Libraries/simpleCUBLAS_LU/README.md index 065e275d..103a9335 100644 --- a/Samples/4_CUDA_Libraries/simpleCUBLAS_LU/README.md +++ b/Samples/4_CUDA_Libraries/simpleCUBLAS_LU/README.md @@ -30,7 +30,7 @@ cudaGetErrorEnum, cudaMalloc, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/simpleCUFFT/README.md b/Samples/4_CUDA_Libraries/simpleCUFFT/README.md index 5ef5cc12..ea9cab85 100644 --- a/Samples/4_CUDA_Libraries/simpleCUFFT/README.md +++ b/Samples/4_CUDA_Libraries/simpleCUFFT/README.md @@ -30,7 +30,7 @@ cudaMalloc, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/simpleCUFFT_2d_MGPU/README.md b/Samples/4_CUDA_Libraries/simpleCUFFT_2d_MGPU/README.md index 05aa7827..d16fd49a 100644 --- a/Samples/4_CUDA_Libraries/simpleCUFFT_2d_MGPU/README.md +++ b/Samples/4_CUDA_Libraries/simpleCUFFT_2d_MGPU/README.md @@ -30,7 +30,7 @@ cudaXtFree, cudaMemcpy, cudaFree, cudaSetDevice, cudaGetDeviceCount, cudaDeviceS ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/simpleCUFFT_MGPU/README.md b/Samples/4_CUDA_Libraries/simpleCUFFT_MGPU/README.md index dfe484b4..283d54a3 100644 --- a/Samples/4_CUDA_Libraries/simpleCUFFT_MGPU/README.md +++ b/Samples/4_CUDA_Libraries/simpleCUFFT_MGPU/README.md @@ -30,7 +30,7 @@ cudaXtFree, cudaSetDevice, cudaGetDeviceCount, cudaDeviceSynchronize, cudaGetDev ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/4_CUDA_Libraries/simpleCUFFT_callback/README.md b/Samples/4_CUDA_Libraries/simpleCUFFT_callback/README.md index ccd39361..2d655764 100644 --- a/Samples/4_CUDA_Libraries/simpleCUFFT_callback/README.md +++ b/Samples/4_CUDA_Libraries/simpleCUFFT_callback/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaFree, cudaMemcpyFromSymbol, cudaGetDevice, cudaMalloc, cudaGetDe ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/4_CUDA_Libraries/watershedSegmentationNPP/README.md b/Samples/4_CUDA_Libraries/watershedSegmentationNPP/README.md index ae433939..81a3f141 100644 --- a/Samples/4_CUDA_Libraries/watershedSegmentationNPP/README.md +++ b/Samples/4_CUDA_Libraries/watershedSegmentationNPP/README.md @@ -30,7 +30,7 @@ cudaRuntimeGetVersion, cudaFree, cudaDeviceGetAttribute, cudaDriverGetVersion, c ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/5_Domain_Specific/BlackScholes/README.md b/Samples/5_Domain_Specific/BlackScholes/README.md index cdd877d2..045712be 100644 --- a/Samples/5_Domain_Specific/BlackScholes/README.md +++ b/Samples/5_Domain_Specific/BlackScholes/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaDeviceSynchronize, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/5_Domain_Specific/BlackScholes_nvrtc/README.md b/Samples/5_Domain_Specific/BlackScholes_nvrtc/README.md index a196e917..4100b2dd 100644 --- a/Samples/5_Domain_Specific/BlackScholes_nvrtc/README.md +++ b/Samples/5_Domain_Specific/BlackScholes_nvrtc/README.md @@ -33,7 +33,7 @@ cudaBlockSize, cudaGridSize ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/FDTD3d/README.md b/Samples/5_Domain_Specific/FDTD3d/README.md index 6af21d46..2b447d95 100644 --- a/Samples/5_Domain_Specific/FDTD3d/README.md +++ b/Samples/5_Domain_Specific/FDTD3d/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaMalloc, cudaFree, cudaFuncGetAttributes, cudaSetDevice, cudaGetD ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/5_Domain_Specific/HSOpticalFlow/README.md b/Samples/5_Domain_Specific/HSOpticalFlow/README.md index 468368c7..3d52a0a9 100644 --- a/Samples/5_Domain_Specific/HSOpticalFlow/README.md +++ b/Samples/5_Domain_Specific/HSOpticalFlow/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaMemcpy, cudaMemset, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/5_Domain_Specific/Mandelbrot/README.md b/Samples/5_Domain_Specific/Mandelbrot/README.md index 47f6b620..f9d9508e 100644 --- a/Samples/5_Domain_Specific/Mandelbrot/README.md +++ b/Samples/5_Domain_Specific/Mandelbrot/README.md @@ -30,7 +30,7 @@ cudaGLUnmapBufferObject, cudaGraphicsUnmapResources, cudaMemcpy, cudaFree, cudaG ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/MonteCarloMultiGPU/README.md b/Samples/5_Domain_Specific/MonteCarloMultiGPU/README.md index e37fcad6..1e934551 100644 --- a/Samples/5_Domain_Specific/MonteCarloMultiGPU/README.md +++ b/Samples/5_Domain_Specific/MonteCarloMultiGPU/README.md @@ -30,7 +30,7 @@ cudaStreamDestroy, cudaMalloc, cudaFree, cudaMallocHost, cudaSetDevice, cudaEven ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/NV12toBGRandResize/README.md b/Samples/5_Domain_Specific/NV12toBGRandResize/README.md index 1e49cb79..c322b453 100644 --- a/Samples/5_Domain_Specific/NV12toBGRandResize/README.md +++ b/Samples/5_Domain_Specific/NV12toBGRandResize/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaStreamDestroy, cudaMalloc, cudaFree, cudaMallocManaged, cudaStre ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/5_Domain_Specific/SLID3D10Texture/README.md b/Samples/5_Domain_Specific/SLID3D10Texture/README.md index d29ac3c8..4e441451 100644 --- a/Samples/5_Domain_Specific/SLID3D10Texture/README.md +++ b/Samples/5_Domain_Specific/SLID3D10Texture/README.md @@ -33,7 +33,7 @@ cudaGraphicsUnmapResources, cudaMalloc, cudaMallocPitch, cudaGetErrorString, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/SobelFilter/README.md b/Samples/5_Domain_Specific/SobelFilter/README.md index 01243317..7987dc9d 100644 --- a/Samples/5_Domain_Specific/SobelFilter/README.md +++ b/Samples/5_Domain_Specific/SobelFilter/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaMallocArray, cudaFreeArray, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/SobolQRNG/README.md b/Samples/5_Domain_Specific/SobolQRNG/README.md index 9b17e903..ae8ae93f 100644 --- a/Samples/5_Domain_Specific/SobolQRNG/README.md +++ b/Samples/5_Domain_Specific/SobolQRNG/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaGetErrorString, cudaFree, cudaDeviceSynchronize, cudaGetDevice, ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/5_Domain_Specific/VFlockingD3D10/README.md b/Samples/5_Domain_Specific/VFlockingD3D10/README.md index a77d5cfe..9fca8b5e 100644 --- a/Samples/5_Domain_Specific/VFlockingD3D10/README.md +++ b/Samples/5_Domain_Specific/VFlockingD3D10/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaFree, cudaGetErrorString, cudaGraphi ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/5_Domain_Specific/bicubicTexture/README.md b/Samples/5_Domain_Specific/bicubicTexture/README.md index f3335112..2bc2f65b 100644 --- a/Samples/5_Domain_Specific/bicubicTexture/README.md +++ b/Samples/5_Domain_Specific/bicubicTexture/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaCreateChannelDesc, cudaMallocArray, cudaFreeArra ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/bilateralFilter/README.md b/Samples/5_Domain_Specific/bilateralFilter/README.md index 23d7f4f9..7d005727 100644 --- a/Samples/5_Domain_Specific/bilateralFilter/README.md +++ b/Samples/5_Domain_Specific/bilateralFilter/README.md @@ -30,7 +30,7 @@ cudaRuntimeGetVersion, cudaGraphicsUnmapResources, cudaMallocPitch, cudaFree, cu ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/binomialOptions/README.md b/Samples/5_Domain_Specific/binomialOptions/README.md index 2a69b255..18a2b691 100644 --- a/Samples/5_Domain_Specific/binomialOptions/README.md +++ b/Samples/5_Domain_Specific/binomialOptions/README.md @@ -27,7 +27,7 @@ cudaDeviceSynchronize, cudaMemcpyToSymbol, cudaMemcpyFromSymbol ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/5_Domain_Specific/binomialOptions_nvrtc/README.md b/Samples/5_Domain_Specific/binomialOptions_nvrtc/README.md index 749a31f1..1e9b82db 100644 --- a/Samples/5_Domain_Specific/binomialOptions_nvrtc/README.md +++ b/Samples/5_Domain_Specific/binomialOptions_nvrtc/README.md @@ -33,7 +33,7 @@ cudaBlockSize, cudaGridSize ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/convolutionFFT2D/README.md b/Samples/5_Domain_Specific/convolutionFFT2D/README.md index 498e586f..0358c7a4 100644 --- a/Samples/5_Domain_Specific/convolutionFFT2D/README.md +++ b/Samples/5_Domain_Specific/convolutionFFT2D/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaFree, cudaDestroyTextureObject, cudaDeviceSynchronize, cudaCreat ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/5_Domain_Specific/dwtHaar1D/README.md b/Samples/5_Domain_Specific/dwtHaar1D/README.md index 1462bf56..14ccd543 100644 --- a/Samples/5_Domain_Specific/dwtHaar1D/README.md +++ b/Samples/5_Domain_Specific/dwtHaar1D/README.md @@ -27,7 +27,7 @@ cudaMalloc, cudaMemcpy, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/5_Domain_Specific/dxtc/README.md b/Samples/5_Domain_Specific/dxtc/README.md index 871bf28b..427ee317 100644 --- a/Samples/5_Domain_Specific/dxtc/README.md +++ b/Samples/5_Domain_Specific/dxtc/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFree, cudaDeviceSynchronize, cudaGetDevice, cudaMalloc, cudaGetD ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/5_Domain_Specific/fastWalshTransform/README.md b/Samples/5_Domain_Specific/fastWalshTransform/README.md index 8bb9b6e7..eb72ad6d 100644 --- a/Samples/5_Domain_Specific/fastWalshTransform/README.md +++ b/Samples/5_Domain_Specific/fastWalshTransform/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFree, cudaDeviceSynchronize, cudaMemset, cudaMalloc ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
## Build and Run diff --git a/Samples/5_Domain_Specific/fluidsD3D9/README.md b/Samples/5_Domain_Specific/fluidsD3D9/README.md index 47de23e2..ccf5ce0c 100644 --- a/Samples/5_Domain_Specific/fluidsD3D9/README.md +++ b/Samples/5_Domain_Specific/fluidsD3D9/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaMallocArray, cudaFreeArray, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/fluidsGL/README.md b/Samples/5_Domain_Specific/fluidsGL/README.md index 267234d4..63e6bb7a 100644 --- a/Samples/5_Domain_Specific/fluidsGL/README.md +++ b/Samples/5_Domain_Specific/fluidsGL/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaMallocArray, cudaFreeArray, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/fluidsGLES/README.md b/Samples/5_Domain_Specific/fluidsGLES/README.md index bfa5c17a..1f7f2b2d 100644 --- a/Samples/5_Domain_Specific/fluidsGLES/README.md +++ b/Samples/5_Domain_Specific/fluidsGLES/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaMallocArray, cudaFreeArray, cudaFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/marchingCubes/README.md b/Samples/5_Domain_Specific/marchingCubes/README.md index d53729d7..7ba9ef54 100644 --- a/Samples/5_Domain_Specific/marchingCubes/README.md +++ b/Samples/5_Domain_Specific/marchingCubes/README.md @@ -30,7 +30,7 @@ cudaGLUnmapBufferObject, cudaGraphicsUnmapResources, cudaCreateChannelDesc, cuda ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/nbody/README.md b/Samples/5_Domain_Specific/nbody/README.md index aeb4a067..eac33a5e 100644 --- a/Samples/5_Domain_Specific/nbody/README.md +++ b/Samples/5_Domain_Specific/nbody/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaSetDeviceFlags, cudaGraphicsResourceSetMapFlags, ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/5_Domain_Specific/nbody_opengles/README.md b/Samples/5_Domain_Specific/nbody_opengles/README.md index 28316827..b412de9b 100644 --- a/Samples/5_Domain_Specific/nbody_opengles/README.md +++ b/Samples/5_Domain_Specific/nbody_opengles/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaSetDeviceFlags, cudaGraphicsResourceSetMapFlags, ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/nbody_screen/README.md b/Samples/5_Domain_Specific/nbody_screen/README.md index 66dbac15..75fc4470 100644 --- a/Samples/5_Domain_Specific/nbody_screen/README.md +++ b/Samples/5_Domain_Specific/nbody_screen/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaSetDeviceFlags, cudaGraphicsResourceSetMapFlags, ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/5_Domain_Specific/p2pBandwidthLatencyTest/README.md b/Samples/5_Domain_Specific/p2pBandwidthLatencyTest/README.md index 8b0ecf2d..b3231a33 100644 --- a/Samples/5_Domain_Specific/p2pBandwidthLatencyTest/README.md +++ b/Samples/5_Domain_Specific/p2pBandwidthLatencyTest/README.md @@ -27,7 +27,7 @@ cudaSetDevice, cudaEventDestroy, cudaOccupancyMaxPotentialBlockSize, cudaCheckEr ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/5_Domain_Specific/postProcessGL/README.md b/Samples/5_Domain_Specific/postProcessGL/README.md index e6c9b217..ac7145d7 100644 --- a/Samples/5_Domain_Specific/postProcessGL/README.md +++ b/Samples/5_Domain_Specific/postProcessGL/README.md @@ -30,7 +30,7 @@ cudaHostAlloc, cudaGraphicsUnmapResources, cudaMalloc, cudaFree, cudaGetChannelD ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/quasirandomGenerator/README.md b/Samples/5_Domain_Specific/quasirandomGenerator/README.md index 9b750620..e95e7787 100644 --- a/Samples/5_Domain_Specific/quasirandomGenerator/README.md +++ b/Samples/5_Domain_Specific/quasirandomGenerator/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFree, cudaDeviceSynchronize, cudaMemset, cudaMemcpyToSymbol, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/README.md b/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/README.md index 307c4926..bd73b432 100644 --- a/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/README.md +++ b/Samples/5_Domain_Specific/quasirandomGenerator_nvrtc/README.md @@ -30,7 +30,7 @@ cuMemcpyDtoH, cuMemAlloc, cuMemFree ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/recursiveGaussian/README.md b/Samples/5_Domain_Specific/recursiveGaussian/README.md index c674ba68..4c34a0dd 100644 --- a/Samples/5_Domain_Specific/recursiveGaussian/README.md +++ b/Samples/5_Domain_Specific/recursiveGaussian/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaFree, cudaGraphicsResourceGetMappedP ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/5_Domain_Specific/simpleD3D10/README.md b/Samples/5_Domain_Specific/simpleD3D10/README.md index a9dc1687..00990bfc 100644 --- a/Samples/5_Domain_Specific/simpleD3D10/README.md +++ b/Samples/5_Domain_Specific/simpleD3D10/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaGetErrorString, cudaGraphicsResourceGetMappedPoi ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/simpleD3D10RenderTarget/README.md b/Samples/5_Domain_Specific/simpleD3D10RenderTarget/README.md index a69a552b..8cb57863 100644 --- a/Samples/5_Domain_Specific/simpleD3D10RenderTarget/README.md +++ b/Samples/5_Domain_Specific/simpleD3D10RenderTarget/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaMalloc, cudaUnbindTexture, cudaGetEr ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/5_Domain_Specific/simpleD3D10Texture/README.md b/Samples/5_Domain_Specific/simpleD3D10Texture/README.md index 95f7f4c6..054ab6b3 100644 --- a/Samples/5_Domain_Specific/simpleD3D10Texture/README.md +++ b/Samples/5_Domain_Specific/simpleD3D10Texture/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMalloc, cudaMallocPitch, cudaGetErrorString, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/simpleD3D11/README.md b/Samples/5_Domain_Specific/simpleD3D11/README.md index d7fe68ed..f0939815 100644 --- a/Samples/5_Domain_Specific/simpleD3D11/README.md +++ b/Samples/5_Domain_Specific/simpleD3D11/README.md @@ -30,7 +30,7 @@ cudaImportKeyedMutex, cudaExternalMemoryGetMappedBuffer, cudaStreamCreateWithFla ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/simpleD3D11Texture/README.md b/Samples/5_Domain_Specific/simpleD3D11Texture/README.md index 52bc5f17..fc8820d4 100644 --- a/Samples/5_Domain_Specific/simpleD3D11Texture/README.md +++ b/Samples/5_Domain_Specific/simpleD3D11Texture/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMalloc, cudaMallocPitch, cudaGetErrorString, cud ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/simpleD3D12/README.md b/Samples/5_Domain_Specific/simpleD3D12/README.md index 90c1d003..dcefda20 100644 --- a/Samples/5_Domain_Specific/simpleD3D12/README.md +++ b/Samples/5_Domain_Specific/simpleD3D12/README.md @@ -30,7 +30,7 @@ cudaWaitExternalSemaphoresAsync, cudaExternalMemoryGetMappedBuffer, cudaImportEx ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/simpleD3D9/README.md b/Samples/5_Domain_Specific/simpleD3D9/README.md index e0153232..7a834b2f 100644 --- a/Samples/5_Domain_Specific/simpleD3D9/README.md +++ b/Samples/5_Domain_Specific/simpleD3D9/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaGraphicsResourceGetMappedPointer, cudaGetLastErr ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/5_Domain_Specific/simpleD3D9Texture/README.md b/Samples/5_Domain_Specific/simpleD3D9Texture/README.md index 25dba9e4..9e8122e5 100644 --- a/Samples/5_Domain_Specific/simpleD3D9Texture/README.md +++ b/Samples/5_Domain_Specific/simpleD3D9Texture/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMalloc, cudaMallocPitch, cudaFree, cudaGetLastEr ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/simpleGL/README.md b/Samples/5_Domain_Specific/simpleGL/README.md index a8f42d35..911c2fa7 100644 --- a/Samples/5_Domain_Specific/simpleGL/README.md +++ b/Samples/5_Domain_Specific/simpleGL/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaFree, cudaGraphicsResourceGetMappedP ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/simpleGLES/README.md b/Samples/5_Domain_Specific/simpleGLES/README.md index 4ecca45d..d2564f1b 100644 --- a/Samples/5_Domain_Specific/simpleGLES/README.md +++ b/Samples/5_Domain_Specific/simpleGLES/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaFree, cudaGraphicsResourceGetMappedP ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/simpleGLES_EGLOutput/README.md b/Samples/5_Domain_Specific/simpleGLES_EGLOutput/README.md index a57af23d..ec705bec 100644 --- a/Samples/5_Domain_Specific/simpleGLES_EGLOutput/README.md +++ b/Samples/5_Domain_Specific/simpleGLES_EGLOutput/README.md @@ -35,7 +35,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaFree, cudaGraphicsResourceGetMappedP ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/simpleGLES_screen/README.md b/Samples/5_Domain_Specific/simpleGLES_screen/README.md index 6cb4545c..1783b665 100644 --- a/Samples/5_Domain_Specific/simpleGLES_screen/README.md +++ b/Samples/5_Domain_Specific/simpleGLES_screen/README.md @@ -30,7 +30,7 @@ cudaGraphicsUnmapResources, cudaMemcpy, cudaFree, cudaGraphicsResourceGetMappedP ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/5_Domain_Specific/simpleVulkan/README.md b/Samples/5_Domain_Specific/simpleVulkan/README.md index 9c3e33bc..e0214c32 100644 --- a/Samples/5_Domain_Specific/simpleVulkan/README.md +++ b/Samples/5_Domain_Specific/simpleVulkan/README.md @@ -30,7 +30,7 @@ cudaStreamCreateWithFlags, cudaExternalMemoryGetMappedBuffer, cudaSignalSemaphor ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/simpleVulkanMMAP/README.md b/Samples/5_Domain_Specific/simpleVulkanMMAP/README.md index 3e5e54b5..669380b5 100644 --- a/Samples/5_Domain_Specific/simpleVulkanMMAP/README.md +++ b/Samples/5_Domain_Specific/simpleVulkanMMAP/README.md @@ -33,7 +33,7 @@ cudaWaitExternalSemaphoresAsync, cudaImportExternalSemaphore, cudaDeviceGetAttri ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/smokeParticles/README.md b/Samples/5_Domain_Specific/smokeParticles/README.md index 60ae9bf2..f8daac32 100644 --- a/Samples/5_Domain_Specific/smokeParticles/README.md +++ b/Samples/5_Domain_Specific/smokeParticles/README.md @@ -30,7 +30,7 @@ cudaExtent, cudaPitchedPtr, cudaCreateTextureObject, cudaMemcpyToSymbol ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/stereoDisparity/README.md b/Samples/5_Domain_Specific/stereoDisparity/README.md index e5e44791..c2a45c1b 100644 --- a/Samples/5_Domain_Specific/stereoDisparity/README.md +++ b/Samples/5_Domain_Specific/stereoDisparity/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFree, cudaEventSynchronize, cudaDeviceSynchronize, cudaCreateTex ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/5_Domain_Specific/volumeFiltering/README.md b/Samples/5_Domain_Specific/volumeFiltering/README.md index 5b97725a..1d19c24b 100644 --- a/Samples/5_Domain_Specific/volumeFiltering/README.md +++ b/Samples/5_Domain_Specific/volumeFiltering/README.md @@ -30,7 +30,7 @@ cudaMemcpy, cudaGraphicsMapResources, cudaDestroySurfaceObject, cudaExtent, cuda ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. 
## Build and Run diff --git a/Samples/5_Domain_Specific/volumeRender/README.md b/Samples/5_Domain_Specific/volumeRender/README.md index 845bd36d..84f8081e 100644 --- a/Samples/5_Domain_Specific/volumeRender/README.md +++ b/Samples/5_Domain_Specific/volumeRender/README.md @@ -30,7 +30,7 @@ cudaProfilerStop, cudaGraphicsUnmapResources, cudaMemcpy, cudaMallocArray, cudaF ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/5_Domain_Specific/vulkanImageCUDA/README.md b/Samples/5_Domain_Specific/vulkanImageCUDA/README.md index 07b2c5a5..3b5ab29e 100644 --- a/Samples/5_Domain_Specific/vulkanImageCUDA/README.md +++ b/Samples/5_Domain_Specific/vulkanImageCUDA/README.md @@ -30,7 +30,7 @@ cudaVkSemaphoreSignal, cudaWaitExternalSemaphoresAsync, cudaMemcpy, cudaVkImport ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/6_Performance/UnifiedMemoryPerf/README.md b/Samples/6_Performance/UnifiedMemoryPerf/README.md index b6c23f24..3ddd4748 100644 --- a/Samples/6_Performance/UnifiedMemoryPerf/README.md +++ b/Samples/6_Performance/UnifiedMemoryPerf/README.md @@ -28,7 +28,7 @@ cudaMemcpy, cudaStreamDestroy, cudaMemPrefetchAsync, cudaFree, cudaMallocHost, c ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 
+Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Make sure the dependencies mentioned in [Dependencies]() section above are installed. ## Build and Run diff --git a/Samples/6_Performance/alignedTypes/README.md b/Samples/6_Performance/alignedTypes/README.md index 6b4c6805..c3d2a30c 100644 --- a/Samples/6_Performance/alignedTypes/README.md +++ b/Samples/6_Performance/alignedTypes/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaFree, cudaDeviceSynchronize, cudaMemset, cudaMalloc, cudaGetDevi ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/6_Performance/transpose/README.md b/Samples/6_Performance/transpose/README.md index cd559986..75997bbf 100644 --- a/Samples/6_Performance/transpose/README.md +++ b/Samples/6_Performance/transpose/README.md @@ -27,7 +27,7 @@ cudaMemcpy, cudaMalloc, cudaFree, cudaGetLastError, cudaEventSynchronize, cudaEv ## Prerequisites -Download and install the [CUDA Toolkit 12.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. +Download and install the [CUDA Toolkit 12.3](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. ## Build and Run diff --git a/Samples/7_libNVVM/CMakeLists.txt b/Samples/7_libNVVM/CMakeLists.txt index 2edecaad..6f9e7675 100644 --- a/Samples/7_libNVVM/CMakeLists.txt +++ b/Samples/7_libNVVM/CMakeLists.txt @@ -87,7 +87,7 @@ message(STATUS "Using rpath: ${CMAKE_INSTALL_RPATH}") # On Windows, locate the nvvm.dll so we can install it. 
if (WIN32) find_file(NVVM_DLL nvvm64_40_0.dll PATHS "${LIBNVVM_HOME}/bin") - if(NOT NVVM_DLL) + if (NOT NVVM_DLL) message(FATAL_ERROR "Found nvvm .h/.lib, but not .dll") endif() install(FILES ${NVVM_DLL} DESTINATION bin) @@ -98,10 +98,18 @@ add_definitions(-DLIBDEVICE_MAJOR_VERSION=1) add_definitions(-DLIBDEVICE_MINOR_VERSION=0) include_directories("${CMAKE_CURRENT_SOURCE_DIR}/common/include") -# Include the LLVM dev package which is required to build cuda-c-linking. -find_package(LLVM CONFIG PATHS "$ENV{LLVM_HOME}") -if (LLVM_FOUND) - add_subdirectory(cuda-c-linking) +# If you wish to build the cuda-c-linking sample and have the LLVM dependencies +# met, then set the ENABLE_CUDA_C_LINKING_SAMPLE variable. This variable can be +# set locally or by adding "-DENABLE_CUDA_C_LINKING_SAMPLE=1" to your cmake +# invocation. See the note about "cuda-c-linking" in README.md. +if (ENABLE_CUDA_C_LINKING_SAMPLE) + # Include the LLVM dev package which is required to build cuda-c-linking. + find_package(LLVM CONFIG PATHS "$ENV{LLVM_HOME}") + if (LLVM_FOUND) + add_subdirectory(cuda-c-linking) + else () + message(STATUS "Skipping the build of the cuda-c-linking sample: Failed to locate the LLVM package.") + endif () else () message(STATUS "Skipping the build of the cuda-c-linking sample.") endif () diff --git a/Samples/7_libNVVM/README.md b/Samples/7_libNVVM/README.md index a3b101c8..9aa39ed0 100644 --- a/Samples/7_libNVVM/README.md +++ b/Samples/7_libNVVM/README.md @@ -44,7 +44,7 @@ identify all of these paths. locally. This is only required for building the cuda-c-linking sample (see the cuda-c-linking note below). -After setting the environment variables and adding the path to the cmake tool +After setting the environment variables and adding the path to the CMake tool via the PATH environment variable, sample script utils/build.sh (for Linux) or utils/build.bat (for Windows) may be executed. 
This script will use build directory "build" to build the samples, and then install them in the "install" @@ -70,6 +70,11 @@ cuda-c-linking sample and have a locally built copy of LLVM that they wish to use. That sample requires the development package of LLVM with the LLVM header files and libraries. +If the LLVM dependencies are met, the user can enable the building of this +sample by setting the CMake variable "ENABLE_CUDA_C_LINKING_SAMPLE" from either +the command line invocation of CMake or by modifying the CMakeLists.txt in this +directory. + Windows users should download LLVM 14 sources from llvm.org and build+install LLVM locally. Using the llvm.org provided Windows installer lacks some of the required components the cuda-c-linking sample depends on. @@ -78,8 +83,8 @@ For Ubuntu users, the "llvm-dev" package contains the LLVM headers and libraries this sample requires, the user should not have to explicitly define an LLVM_HOME in this case. -Windows users will want to build this sample using the same cmake build mode +Windows users will want to build this sample using the same CMake build mode as they built LLVM with. For instance if they built LLVM in Release mode, then this sample should also be built in Release mode. The utils/build.bat can -be updated to reflect this: Add "-DCMAKE_BUILD_TYPE=Release" to the cmake +be updated to reflect this: Add "-DCMAKE_BUILD_TYPE=Release" to the CMake invocation. diff --git a/Samples/7_libNVVM/cuda-c-linking/README.md b/Samples/7_libNVVM/cuda-c-linking/README.md index 6ed393cd..3c150354 100644 --- a/Samples/7_libNVVM/cuda-c-linking/README.md +++ b/Samples/7_libNVVM/cuda-c-linking/README.md @@ -23,8 +23,9 @@ Files Building -------- -This sample is built as part of the libnvvm samples tree. Please see the -README file at the root of the libnvvm samples tree for build instructions. +This sample is optionally built as part of the libnvvm samples from the CUDA +samples tree. 
Please see the README file at the root of the libnvvm samples +for build instructions. Usage ----- diff --git a/Samples/7_libNVVM/uvmlite/README.md b/Samples/7_libNVVM/uvmlite/README.md index f7138621..643212af 100644 --- a/Samples/7_libNVVM/uvmlite/README.md +++ b/Samples/7_libNVVM/uvmlite/README.md @@ -40,7 +40,7 @@ retrieve a device pointer first, which can be done using cuModuleGetGlobal(). size_t size_xxx; // size of xxx result = cuModuleGetGlobal(&devp_xxx, &size_xxx, hModule, "xxx"); -Whether of not the pointer points to a managed memory may be queried +Whether or not the pointer points to managed memory may be queried by calling cuPointerGetAttribute() with the pointer attribute CU_POINTER_ATTRIBUTE_IS_MANAGED. diff --git a/Samples/7_libNVVM/uvmlite/uvmlite.c b/Samples/7_libNVVM/uvmlite/uvmlite.c index 745436fa..3f8c54b9 100644 --- a/Samples/7_libNVVM/uvmlite/uvmlite.c +++ b/Samples/7_libNVVM/uvmlite/uvmlite.c @@ -34,6 +34,12 @@ #include #include +#define ERROR_IF(expr) \ + if (expr) { \ + fprintf(stderr, "Failed check at %s:%d\n", __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } + // If 'err' is non-zero, emit an error message and exit. #define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__) static void __checkCudaErrors(CUresult err, const char *filename, int line) { @@ -221,7 +227,7 @@ static CUresult buildKernel(CUcontext *phContext, CUdevice *phDevice, return CUDA_SUCCESS; } -int main(int argc, char **argv) { +int main(void) { const unsigned int nThreads = 1; const unsigned int nBlocks = 1; @@ -245,7 +251,7 @@ int main(int argc, char **argv) { int attrVal; checkCudaErrors(cuDeviceGetAttribute( &attrVal, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, hDevice)); - assert(attrVal == 1); + ERROR_IF(attrVal != 1); } // Get the address of the variable xxx, yyy in the managed memory. 
@@ -260,23 +266,10 @@ int main(int argc, char **argv) { checkCudaErrors(cuPointerGetAttribute( &attrVal, CU_POINTER_ATTRIBUTE_IS_MANAGED, devp_xxx)); - assert(attrVal == 1); + ERROR_IF(attrVal != 1); checkCudaErrors(cuPointerGetAttribute( &attrVal, CU_POINTER_ATTRIBUTE_IS_MANAGED, devp_yyy)); - assert(attrVal == 1); - } - - // The "physical" memory location of the memory that the devp_yyy addresses is - // the device memory type. - { - unsigned int attrVal; - - checkCudaErrors(cuPointerGetAttribute( - &attrVal, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, devp_xxx)); - assert(attrVal == CU_MEMORYTYPE_DEVICE); - checkCudaErrors(cuPointerGetAttribute( - &attrVal, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, devp_yyy)); - assert(attrVal == CU_MEMORYTYPE_DEVICE); + ERROR_IF(attrVal != 1); } // Since CUdeviceptr is opaque, it is safe to use cuPointerGetAttribute to get @@ -296,8 +289,8 @@ int main(int argc, char **argv) { printf("The initial value of xxx initialized by the device = %d\n", *p_xxx); printf("The initial value of yyy initialized by the device = %d\n", *p_yyy); - assert(*p_xxx == 10); - assert(*p_yyy == 100); + ERROR_IF(*p_xxx != 10); + ERROR_IF(*p_yyy != 100); // The host adds 1 and 11 to xxx and yyy. *p_xxx += 1;