Compare commits

...

324 Commits

Author SHA1 Message Date
Rob Armstrong
8a9e2c830c
Update 1_Utilities/README.md to redirect bandwidthTest to NVBandwidth (#371) 2025-05-22 11:43:14 -07:00
Rob Armstrong
adacf1cffd
Merge pull request #368 from XSShawnZeng/master
Update the vulkan headers include sequence and the transpose code format check
2025-05-21 09:27:13 -07:00
shawnz
da3b7a2b3c Update the vulkanImageCUDA/vulkanImageCUDA.cu for Windows headers 2025-05-19 17:43:08 +08:00
shawnz
5987a9e9fa Update transpose for code format check 2025-05-19 17:38:42 +08:00
shawnz
107f3f537f Update the include files sequence for vulkan samples on Windows 2025-05-19 17:38:22 +08:00
Francesco Rizzi
b530f1cf42
Fix bug in 6_Performance/transpose: copy sharedmem kernel (#363)
Update kernel loop bounds handling, main loop data copy to avoid incorrect reuse of output results.

---------

Authored-by: Francesco Rizzi <francesco.rizzi@ng-analytics.com>
2025-05-05 08:43:23 -07:00
Rob Armstrong
cab7c66b4f Update pre-commit config to include Python and JSON for EOL, whitespace checks 2025-05-01 10:17:42 -07:00
Rob Armstrong
8d400cfb7f Additional minor changes to run_tests.py output formatting 2025-05-01 10:14:09 -07:00
Rob Armstrong
6d6d964f97 Minor changes to run_tests.py output formatting 2025-05-01 09:54:25 -07:00
Rob Armstrong
ab68d58d59 Remove unused bin/x86_64 directory hierarchy 2025-05-01 09:53:54 -07:00
Rob Armstrong
c70d79cf3b Final 12.9 README updates 2025-05-01 09:39:06 -07:00
Rob Armstrong
14b1bfdcc4 Replace README references to "CUDA Toolkit 12.5" with general "CUDA Toolkit" 2025-04-30 09:46:45 -07:00
Rob Armstrong
c14a0114d6 Some samples require multiple GPUs. Update 'run_tests.py' to skip them on single- or no-GPU systems. 2025-04-30 09:45:20 -07:00
Rob Armstrong
ee15cc0fe2 Merge branch 'shawnz_bugs_fix' into 'master'
Bug fix for 5241914, 5164417 and 5097376

See merge request cuda-samples/cuda-samples!107
2025-04-28 08:53:11 -07:00
shawnz
3438fd4875 Update README for OpenMP 2025-04-28 23:44:45 +08:00
shawnz
b27b55ec70 Bug 5241914: Fix the error message for cuSolverDn_LinearSolver 2025-04-27 16:57:02 +08:00
shawnz
49159f3739 Bug 5164417 and 5097376: Fix the OpenMP detection issue for MSVC and Clang 2025-04-27 16:50:12 +08:00
Rob Armstrong
1680a1dc7f Update Windows FreeImage configuration instructions in README.md 2025-04-21 09:20:22 -07:00
Rob Armstrong
49daf0e4e0 Merge Bug 5199167: Fix the includes issue for 5_Domain_Specific\simpleD3D12
See merge request cuda-samples/cuda-samples!106
2025-04-21 08:11:52 -07:00
shawnz
a45fd3bd7c Bug 5199167: Fix the includes issue for 5_Domain_Specific\simpleD3D12 2025-04-21 11:52:33 +08:00
Rob Armstrong
0345908807 Update run_tests.py to enable multithreading 2025-04-07 08:48:44 -07:00
Rob Armstrong
3b9c8ce2e9 Merge branch 'shawnz_bugs_fix' into 'master'
Bug 5207005: Append pid in shmName for Linux only as this is for MIG scenario

See merge request cuda-samples/cuda-samples!100
2025-04-07 08:21:40 -07:00
shawnz
e77d6eb5ab Bug 5207005: Append pid in shmName for Linux only as this is for MIG scenario 2025-04-07 17:17:17 +08:00
Rob Armstrong
ac700327a2 Add folders to CMakeLists.txt for supporting generators and IDEs 2025-04-05 09:54:24 -07:00
Rob Armstrong
17703dd426 Merge branch 'shawnz_bugs_fix' into 'master'
Bug 5196977: Update includes for nbody

See merge request cuda-samples/cuda-samples!98
2025-04-03 01:16:20 -07:00
shawnz
a32d5badf7 Bug 5196977: Update includes for nbody 2025-04-03 15:30:05 +08:00
Rob Armstrong
1fd22429c3 Merge branch 'shawnz_bugs_fix' into 'master'
Changes fixing bugs 5196977, 4914019, 4191696, and 5199167.

See merge request cuda-samples/cuda-samples!97
2025-04-02 22:28:17 -07:00
Rob Armstrong
00ac0a1673 Remove bandwidthTest subdirectory from CMakeLists.txt 2025-04-02 22:27:30 -07:00
shawnz
b013387a39 Update code format 2025-04-03 11:23:26 +08:00
Rob Armstrong
9d921e0fe7 Add CONTRIBUTING.md 2025-04-02 11:29:16 -07:00
Rob Armstrong
7d1730f348 Remove outdated bandwidthTest sample 2025-04-02 11:19:48 -07:00
shawnz
718fe6486d Bug 5199167: Adjust the include header files sequence for simpleD3D11/simpleD3D11Texture 2025-04-02 15:10:29 +08:00
shawnz
ad9908e32b Bug 4914019 & 4191696: Append pid in shmName for the MIG multi-threaded scenario 2025-04-02 11:20:09 +08:00
shawnz
952d6edf92 Bug 5196977: Include helper_gl.h before cuda_gl_interop.h 2025-04-01 16:07:32 +08:00
Rob Armstrong
685709bfc7 Merge branch 'shawnz_bugs_fix' into 'master'
Bug fix for bug 5194249, 5188945 and 5164374

See merge request cuda-samples/cuda-samples!95
2025-03-31 08:00:50 -07:00
shawnz
0c92c34ca9 Bug 5164374: Remove the register keyword, which was deprecated and removed in the C++17 standard 2025-03-31 15:13:56 +08:00
shawnz
0d82634f70 5188945: Add freeglut and glew64 .dll files for minsizeRel/RelWithDebInfo build 2025-03-31 15:07:29 +08:00
shawnz
4abbdf4e80 Bug 5194249: Need to include cuda_runtime.h for cudaNvSci after the clang format change 2025-03-31 14:57:31 +08:00
Rob Armstrong
914ca00f89 Small update to README.md to clarify test script usage. 2025-03-28 15:16:10 -07:00
Rob Armstrong
c8034f368a Add helper utility to test-run all built samples (see README.md for usage details) 2025-03-28 15:07:07 -07:00
Rob Armstrong
ceab6e8bcc Apply consistent code formatting across the repo. Add clang-format and pre-commit hooks. 2025-03-27 10:30:07 -07:00
Rob Armstrong
2cd58fbc9a Update README version for 12.9 2025-03-26 10:24:22 -07:00
Rob Armstrong
c0ab53f986 Update all sample CMakeLists.txt to include ENABLE_CUDA_DEBUG flag to enable cuda-gdb 2025-03-26 10:08:59 -07:00
Rob Armstrong
b87c243bbb Add -lineinfo flag to all targets to include line information for developer tools 2025-03-26 09:44:20 -07:00
Rob Armstrong
e214cd29aa Update gencode arguments for separate kernel fatbin builds 2025-03-26 09:28:37 -07:00
Rob Armstrong
06d72496c2 Merge branch 'shawnz_tegra_crossbuild_toolchain' into 'master'
Bug 5133197: Add cmake toolchain and update the CMakeList of some sample...

See merge request cuda-samples/cuda-samples!94
2025-03-25 14:52:02 -07:00
shawnz
2848d3bd21 Bug 5176886: Enable nvJPEG samples for aarch64 2025-03-21 13:02:14 +08:00
shawnz
bd0f630bf4 Bug 5133197: Add cmake toolchain and update the CMakeLists of some samples for Tegra Linux cross build 2025-03-20 12:43:44 +08:00
shawnz
ab9166a6b2 Bug 5139353 and 5139213: Enhancement for streamOrderedAllocationIPC 2025-03-12 15:28:54 +08:00
Rob Armstrong
c90a1c6981 Merge public repo changes 2025-03-08 08:30:35 -08:00
Rob Armstrong
9370f11e69 graphConditionalNodes: Additional tweaks to launch dimension initialization (#348) 2025-03-05 18:18:37 -08:00
Rob Armstrong
291435e0b4
graphConditionalNodes: Additional tweaks to launch dimension initialization (#348) 2025-03-05 18:17:27 -08:00
Rob Armstrong
8d901e745d graphConditionalNodes: Change launch dimension initialization for better cross-platform compatibility (#346) 2025-03-05 08:33:35 -08:00
Rob Armstrong
990ebc01c2
graphConditionalNodes: Change launch dimension initialization for better cross-platform compatibility (#346) 2025-03-05 08:32:58 -08:00
Shawn Zeng
9adce9d9f2 Update file CMakeLists.txt 2025-03-03 19:19:50 -08:00
Rob Armstrong
bcad2c9e61 graphConditionalNodes: Add switch, while, if/else conditional examples and minor cleanup (#344) 2025-03-03 17:50:22 -08:00
Rob Armstrong
e7b23470d5
graphConditionalNodes: Add switch, while, if/else conditional examples and minor cleanup (#344) 2025-03-03 17:49:17 -08:00
Shawn Zeng
310e7f2a11 Bug 5143332: Remove the redundant content in 0_Introduction/CMakeLists.txt 2025-03-03 17:37:48 -08:00
Shawn Zeng
7f0f63f311 Bug 5034785: Update all non-ctx nppi APIs to ctx APIs as per latest change on NPP 2025-02-27 03:01:47 -08:00
Shawn Zeng
acd3a015c8 Revert "Bug 5034785: Update all non-ctx nppi APIs to ctx APIs as per latest change on NPP"
This reverts commit a9869fd6eaeecc748fc5f10f4b331fa41efbdaca
2025-02-27 02:48:03 -08:00
shawnz
a9869fd6ea Bug 5034785: Update all non-ctx nppi APIs to ctx APIs as per latest change on NPP 2025-02-27 18:43:53 +08:00
XSShawnZeng
3e8f91d1a1
Several small bug fixes for Windows platforms
* Enhancement for GLFW include and lib search

* Fixing issue #321: A potential bug in memMapIPCDrv/memMapIpc.cpp

* Update CMakeLists.txt for the sample 0_Introduction/template

* Copy .dll to correct dir for 5_Domain_Specific/Mandelbrot

* Fix typo

* Update changelog for cudaNvSciBufMultiplanar
2025-02-26 08:23:39 -08:00
Jonathan Bentz
f3b7c41ad6
cudaNvSci: Update README.md fixing typo (#337)
Fixes #193
2025-02-21 09:21:43 -08:00
Jonathan Bentz
29fb758e62
conjugateGradient: Ensure allocated memory is freed (#336)
Fixes #202
2025-02-21 09:20:53 -08:00
Jonathan Bentz
3bc08136ff
Update README.md link for sortingNetworks (#335)
Fixes #302
2025-02-21 09:19:21 -08:00
Jonathan Bentz
85eefa06c4
boxFilter: Remove unused parameter (#338)
Fixes: #122
2025-02-21 09:17:45 -08:00
XSShawnZeng
c357dd1e6b
Fixing issue #321: A potential bug in memMapIPCDrv/memMapIpc.cpp (#334) 2025-02-21 09:14:25 -08:00
Jonathan Bentz
efb46383e0
Transpose: Change TILE_DIM to 32 to fix bank conflicts
Fixes #175
2025-02-20 15:46:44 -08:00
XSShawnZeng
8d564d5e3a
Enhancement for GLFW include and lib search (#331)
Fixes NVIDIA bug 5115098
2025-02-20 08:06:40 -08:00
Jake Hemstad
37c5bcbef4 Update kernels.cuh 2025-02-19 17:33:10 -08:00
Rob Armstrong
940a4c7a91
memMapIpc: Resolve build-time warnings and minor potential issues (#329)
* Fix compute performance calculation type casting in gpuGetMaxGflopsDeviceIdDRV() for #109

* 3_CUDA_Features/memMapIPCDrv: Increase procIdx buffer size to prevent potential buffer overflow

* memMapIPCDrv: Fix memory leaks and improve header inclusion

- Remove redundant string.h header
- Add memory cleanup for dynamically allocated JIT options and log buffer
- Fix printf format specifier for unsigned long long
2025-02-19 15:52:20 -08:00
ohmaya
61bd39800d
simplePrintf.cu: "Compute capability" text (#299)
Compute %d.%d capability => Compute capability %d.%d
2025-02-19 15:22:34 -08:00
Rob Armstrong
8a96d2eee7
Fix compute performance calculation type casting in gpuGetMaxGflopsDeviceIdDRV() for #109 2025-02-19 10:43:18 -08:00
Rob Armstrong
e762d58260
Merge pull request #247 from sangeetsatheesh/master
Fix typo from Open issue #161
2025-02-18 17:22:48 -08:00
Rob Armstrong
8fd1701744
Merge branch 'master' into master 2025-02-18 17:22:04 -08:00
Rob Armstrong
94765c1597
Fix minor typo in README.md (#326) 2025-02-18 17:14:14 -08:00
Rob Armstrong
c87881f02c
Update matrix multiplication sample README references (#325)
- Clarify reference to Shared Memory section in CUDA programming guide
- Update cuBLAS interface version description
- Add hyperlink to Shared Memory documentation
2025-02-18 14:02:59 -08:00
Rob Armstrong
25400b6b3c
Merge pull request #287 from steffen-v/patch-1
fix "gridy" comandline argument for initMC
2025-02-18 13:30:27 -08:00
Rob Armstrong
e24f62e28c
Fix README.md version number typo
Fix inadvertent reference to prior release in README.md
2025-02-15 13:37:51 -08:00
Rob Armstrong
db3eea2394
Update CUDA Samples for CTK 12.8 release and migrate build system to CMake
Update CUDA Samples for CTK 12.8 release and migrate build system to CMake
2025-02-15 13:23:26 -08:00
Rob Armstrong
04f3686bbe
Merge pull request #24 from XSShawnZeng/master
Enhancement for finding GLFW on WIN and copy .dll files to executable…
2025-02-14 15:03:34 -08:00
shawnz
0e87b76137 Update README 2025-02-14 22:46:04 +08:00
shawnz
fb6fcb0110 Enhancement for finding GLFW on WIN and copy .dll files to executable dir for some samples 2025-02-14 22:37:51 +08:00
Rob Armstrong
14b8ceb56f
Merge pull request #23 from XSShawnZeng/master
Add SM support for simpleAtomicIntrinsics
2025-02-12 22:40:36 -08:00
shawnz
a6737fd72b Add SM support for simpleAtomicIntrinsics 2025-02-13 11:53:55 +08:00
Rob Armstrong
96901090bc nvJPEG: Modify write_images function to return void instead of int
- Changed return type from int to void
- Removed EXIT_FAILURE return in error case (unchecked)
- Removed "control reaches end of non-void function" warning
2025-02-12 11:49:35 -08:00
Rob Armstrong
8b2b51e2a5 NV12toBGRandResize: Fix potential buffer overflow in file output functions
- Increased filename buffer sizes from 120 to 256 characters
- Replaced sprintf() with snprintf() to prevent potential buffer overflows
2025-02-12 11:41:53 -08:00
Rob Armstrong
dcce6e1f14
Merge pull request #22 from XSShawnZeng/master
Update source code for cudaGraphPerfScaling and remove dupe target in cuda-c-linking CMakeLists.txt
2025-02-12 08:14:25 -08:00
shawnz
cc3d94f81c Update .dll copy for 7_libNVVM 2025-02-12 18:12:26 +08:00
shawnz
7ee6db679e Remove dupe target in cuda-c-linking CMakeLists.txt 2025-02-12 15:49:29 +08:00
shawnz
24a617c043 Update source code for cudaGraphPerfScaling 2025-02-12 12:22:55 +08:00
Rob Armstrong
93f1c78c5b 2_Concepts_and_Techniques/EGLStream_CUDA_Interop: Update types for CUDA consumer to use const char* to resolve build warnings 2025-02-11 17:41:59 -08:00
Rob Armstrong
5932d18738 Fix warning about potential string overflow in 0_Introduction/simpleIPC 2025-02-11 17:31:36 -08:00
Rob Armstrong
5206607816
Merge pull request #21 from XSShawnZeng/master
Copy all the needed .ll and .dll for 7_libNVVM
2025-02-11 08:14:17 -08:00
shawnz
f8f3e1b347 Add new line at end of files 2025-02-11 17:05:29 +08:00
shawnz
4fcfa82d7d Copy all the needed .ll and .dll for 7_libNVVM 2025-02-11 17:01:43 +08:00
Rob Armstrong
7a9bd38ecc Update OpenGL sample build settings on Windows 2025-02-10 23:12:38 -08:00
Rob Armstrong
9582bb03a9
Merge pull request #20 from XSShawnZeng/master
Turn on assert log for SimpleAssert and copy files to folder of executable for watershedSegmentationNPP
2025-02-10 08:06:33 -08:00
shawnz
11bc856cad Turn on assert log for SimpleAssert and copy files to folder of executable for watershedSegmentationNPP 2025-02-10 17:02:44 +08:00
Rob Armstrong
95308ffc23 Add missing build targets to general samples 2025-02-08 13:04:26 -05:00
Rob Armstrong
56852fbb50 Add explicit system libraries needed on some Linux distributions 2025-02-07 15:43:17 -05:00
Rob Armstrong
152ba4b941
Merge pull request #19 from XSShawnZeng/master
Bug 5097243: Add nvJitLink lib for Windows
2025-02-07 09:42:33 -08:00
shawnz
a8138b60fe Bug 5097243: Add nvJitLink lib for Windows 2025-02-07 17:28:10 +08:00
Rob Armstrong
c7bfd4418b
Merge pull request #18 from jnbntz/nvvm_updates
changing to CMAKE_CURRENT_BINARY_DIR for the copy of input files
2025-02-06 08:31:23 -08:00
Rob Armstrong
eb378b2fce
Merge branch 'master' into nvvm_updates 2025-02-06 08:31:03 -08:00
Rob Armstrong
fd5674f9e5
Merge pull request #17 from XSShawnZeng/master
Fix the nvsci lib finding issue for auto-linux and update sm list for Tegra samples
2025-02-06 08:28:21 -08:00
Rob Armstrong
bb2aebaad8 Remove unused duplicate targets in libNVVM samples 2025-02-06 08:25:35 -08:00
shawnz
2539826a99 Fix the nvsci lib finding issue for auto-linux and update sm list for Tegra samples 2025-02-06 17:05:48 +08:00
Jonathan Bentz
1eb3c947c3 changing to CMAKE_CURRENT_BINARY_DIR for the copy of input files 2025-02-05 19:57:07 -06:00
Rob Armstrong
9d03b030a6
Merge pull request #16 from XSShawnZeng/master
Adding support for Blackwell
2025-02-04 21:46:10 -08:00
shawnz
78f83ca02d Adding support for Blackwell 2025-02-05 11:17:06 +08:00
Rob Armstrong
e8a041d783 Ensure -Wno-deprecated-gpu-targets is passed to PTX and fatbin compilation steps 2025-02-04 12:07:51 -08:00
Rob Armstrong
c631850c15 Remove compute capability 8.7 build target for non-Tegra builds 2025-02-04 12:01:38 -08:00
Rob Armstrong
9539ca5fa3 Remove compute capability 7.2 build target for non-Tegra builds 2025-02-04 11:58:25 -08:00
Rob Armstrong
0f4bdfad99 CMake: Add '-Wno-deprecated-gpu-targets' to suppress warning messages during build about Maxwell, Pascal, and Volta 2025-02-04 11:51:17 -08:00
Rob Armstrong
e3a5ae4aca Library samples: Fix malformed CMakeLists.txt 2025-02-04 10:01:28 -08:00
Rob Armstrong
30b411ad56 boxFilter: Fix malformed CMakeLists.txt 2025-02-04 10:01:28 -08:00
Rob Armstrong
941670671c
Merge pull request #15 from rwarmstr/cmake_transition
Cmake transition
2025-02-03 11:14:18 -08:00
Rob Armstrong
be5012ef69 Add automotive Linux build instructions 2025-01-27 10:16:33 -08:00
Rob Armstrong
9824a63101
Merge pull request #14 from XSShawnZeng/Tegra_Samples_Cmake_Transition
Update CMakeList.txt for simpleCUFFT_callback and GLES samples (Tegra samples cmake transition)
2025-01-24 08:44:47 -08:00
XSShawnZeng
5f6e6072d9
Merge pull request #8 from XSShawnZeng/cmake_transition
Merge pull request #13 from XSShawnZeng/Tegra_Samples_Cmake_Transition
2025-01-24 15:45:22 +08:00
shawnz
a80688012e Update CMakeList.txt for simpleCUFFT_callback and GLES samples 2025-01-24 15:44:19 +08:00
Rob Armstrong
aad89bb0ab
Merge pull request #13 from XSShawnZeng/Tegra_Samples_Cmake_Transition
Update Tegra SMs
2025-01-23 08:39:43 -08:00
shawnz
ab0b386ac4 Update Tegra SMs 2025-01-23 16:20:34 +08:00
XSShawnZeng
5cffc8815a
Merge pull request #7 from XSShawnZeng/cmake_transition
Merge pull request #12 from XSShawnZeng/Tegra_Samples_Cmake_Transition
2025-01-23 16:19:49 +08:00
Rob Armstrong
27c0a166f3
Merge pull request #12 from XSShawnZeng/Tegra_Samples_Cmake_Transition
Tegra samples cmake transition
2025-01-22 21:46:36 -08:00
shawnz
08fae276b4 Add Tegra SMs in CMakeLists.txt for general samples 2025-01-23 11:02:56 +08:00
XSShawnZeng
0f697f1819
Merge pull request #6 from XSShawnZeng/cmake_transition
Cmake transition
2025-01-23 10:43:47 +08:00
Rob Armstrong
b203467419 Update CUDA architectures list 2025-01-22 17:49:44 -08:00
Rob Armstrong
41d5f63ec5 Add note about QNX to main README 2025-01-22 09:51:31 -08:00
Rob Armstrong
3de3f2f46c
Merge pull request #11 from XSShawnZeng/Tegra_Samples_Cmake_Transition
Update separate SM list for cdp samples and update watershedSegmentationNPP for the build failure in Bug 4668487
2025-01-21 09:38:41 -08:00
XSShawnZeng
95128a1cf3
Update CMakeLists.txt 2025-01-20 10:05:45 +08:00
XSShawnZeng
3848a7c63c
Update CMakeLists.txt 2025-01-20 10:05:18 +08:00
XSShawnZeng
d9633314f8
Update CMakeLists.txt 2025-01-20 10:04:50 +08:00
XSShawnZeng
576c24f97e
Update CMakeLists.txt 2025-01-20 10:04:24 +08:00
XSShawnZeng
970a6a1151
Update CMakeLists.txt 2025-01-20 10:04:02 +08:00
shawnz
76e2d2052c Update separate SM list for cdp samples and update watershedSegmentationNPP for the build failure in Bug 4668487 2025-01-17 12:04:27 +08:00
XSShawnZeng
03a7300081
Merge pull request #5 from XSShawnZeng/cmake_transition
Cmake transition
2025-01-17 10:51:43 +08:00
Rob Armstrong
ccb341a5e3 Add placeholder toolchain file for QNX cross-compilation 2025-01-16 12:59:26 -08:00
Rob Armstrong
421e49450f Remove outdated build and run information from cudaNvSciBufMultiplanar 2025-01-16 09:06:06 -08:00
Rob Armstrong
886860a123 Ignore local .clangd if it exists 2025-01-16 09:04:37 -08:00
Rob Armstrong
9a454d3ba9
Merge pull request #9 from jnbntz/cmake_transition
Copying input files into BINDIR so that these samples will run properly
2025-01-16 09:02:53 -08:00
Rob Armstrong
1a466282da
Merge pull request #10 from XSShawnZeng/Tegra_Samples_Cmake_Transition
Add Tegra sample cudaNvSciBufMultiplanar
2025-01-16 09:01:44 -08:00
shawnz
545194e7aa Add Tegra sample cudaNvSciBufMultiplanar 2025-01-16 12:22:52 +08:00
XSShawnZeng
e2d3c20bd3
Merge pull request #4 from XSShawnZeng/cmake_transition
Cmake transition
2025-01-16 12:05:33 +08:00
Jonathan Bentz
48e70c8b31 Copying input files into BINDIR so that these samples will run properly 2025-01-15 19:22:54 -06:00
Rob Armstrong
b518bfe9be
Merge pull request #8 from jnbntz/cmake_transition
fixing a couple typos in cmakelists files
2025-01-15 08:43:08 -08:00
Jonathan Bentz
e633580eff a couple more typos in comments 2025-01-14 18:12:09 -06:00
Jonathan Bentz
df9ea79df4 typo errors in a few CMakelists files 2025-01-14 18:09:26 -06:00
Rob Armstrong
eacf41c980 Update cudaNvSciNvMedia CMake search path 2025-01-14 09:20:38 -08:00
Rob Armstrong
f8fbd04007 Update CMake module search path 2025-01-14 09:14:29 -08:00
XSShawnZeng
01956cfecc
Merge pull request #3 from XSShawnZeng/cmake_transition
Cmake transition
2025-01-13 11:10:07 +08:00
Rob Armstrong
380c065a0c Move EGLSync_CUDAEvent_Interop directory entry 2025-01-10 11:15:11 -08:00
Rob Armstrong
6c548b5d43 Update Changelog for removed and moved samples 2025-01-10 08:17:00 -08:00
Rob Armstrong
11c70243db Move EGLSync_CUDAEvent_Interop to Tegra samples 2025-01-10 08:12:56 -08:00
Rob Armstrong
415ad05c98 Remove remaining legacy Makefiles 2025-01-10 08:05:41 -08:00
Rob Armstrong
2d0314212b Remove Tegra QNX samples nbody_screen, simpleGLES_screen 2025-01-10 08:02:43 -08:00
Rob Armstrong
feffc60cbf
Merge pull request #7 from XSShawnZeng/Tegra_Samples_Cmake_Transition
Update the CMakeList for remaining tegra samples and remove the old M…
2025-01-10 07:57:39 -08:00
shawnz
d3ded4a251 Update the CMakeLists for remaining Tegra samples and remove the old Makefile/NsightEclipse.xml 2025-01-10 17:11:33 +08:00
Rob Armstrong
62b96a65b5 Add glfw3 existence checks to Vulkan samples 2025-01-09 22:48:37 -08:00
Rob Armstrong
450038ea73
Merge pull request #6 from jnbntz/aarch64_guarding
Changes for building for aarch64, specifically Jetson Orin Nano
2025-01-09 10:06:09 -08:00
Rob Armstrong
ce045e2ae9
Merge branch 'cmake_transition' into aarch64_guarding 2025-01-09 10:03:16 -08:00
Rob Armstrong
f753e86e7a Update all samples to build position-independent code 2025-01-09 09:59:36 -08:00
Jonathan Bentz
a1cf9e4183 Changes for building for aarch64, specifically Jetson Orin Nano 2025-01-08 17:05:45 -06:00
Rob Armstrong
e8492d1a78
Merge pull request #5 from XSShawnZeng/tegra_samples_cmake_transition
Tegra samples cmake transition for cudaNvSci and cuDLA samples
2025-01-08 09:03:13 -08:00
XSShawnZeng
11355321a0
Create CMakeLists.txt for cuDLAStandaloneMode 2025-01-08 16:05:34 +08:00
XSShawnZeng
3532ede709
Create CMakeLists.txt for cuDLALayerwiseStatsStandalone 2025-01-08 16:04:59 +08:00
XSShawnZeng
8f1d565faa
Create CMakeLists.txt for cuDLALayerwiseStatsHybrid 2025-01-08 16:04:06 +08:00
XSShawnZeng
f00f52d154
Create CMakeLists.txt for cuDLAHybridMode 2025-01-08 16:03:24 +08:00
XSShawnZeng
dc24cde377
Create CMakeLists.txt for cuDLAErrorReporting 2025-01-08 16:02:46 +08:00
XSShawnZeng
2c5c6dc7d4
Create CMakeLists.txt for cudaNvSci 2025-01-08 16:01:50 +08:00
XSShawnZeng
3fc438428a
Update Tegra CMakeLists.txt 2025-01-08 16:00:21 +08:00
Rob Armstrong
5409227cf8
Merge pull request #4 from XSShawnZeng/tegra_samples_nboby_opengles
Update the CMakeLists.txt for 3 Tegra samples
2025-01-06 09:38:21 -08:00
XSShawnZeng
e36545ac1c
Create CMakeLists.txt 2025-01-06 18:49:01 +08:00
XSShawnZeng
41a65b94c8
Update CMakeLists.txt to include the Tegra samples 2025-01-06 18:48:12 +08:00
XSShawnZeng
89a84131fb
Update CMakeLists.txt for Tegra sample simpleGLES 2025-01-06 18:47:18 +08:00
XSShawnZeng
bb121fe02b
Create the CMakeLists.txt for Tegra sample fluidsGLES
Able to build the sample as below:
$ make
Linking CUDA device code CMakeFiles/fluidsGLES.dir/cmake_device_link.o
Linking CXX executable fluidsGLES
Built target fluidsGLES
2025-01-06 17:52:53 +08:00
XSShawnZeng
e228ad9389
Update the CMakeLists.txt for Tegra sample nbody_opengles 2025-01-06 17:12:28 +08:00
Rob Armstrong
1e0c660f22 Move GLES and QNX samples to platform-specific subdirectory 2024-12-27 22:56:03 -08:00
Rob Armstrong
e52cdee6c4 Unify Windows-only sample messages 2024-12-26 17:26:37 -08:00
Rob Armstrong
9045dd3a1a
Merge pull request #3 from jnbntz/windows_fixes
Windows fixes for cmake copy commands and some additional nvrtc libs needed on windows builds
2024-12-20 13:25:33 -08:00
Jonathan Bentz
3e7bb5f2e8 updated to fix for Linux 2024-12-20 12:52:06 -06:00
Jonathan Bentz
88e5f4a395 Fixing more copy files in nvrtc builds. 2024-12-20 09:28:17 -06:00
Jonathan Bentz
5c9e573ece fixing copy files for cuSolverRf and cuSolverPs_LowlevelCholesky 2024-12-20 07:48:01 -06:00
Jonathan Bentz
c08a2f31ff fixing copy for windows build cuSolverSp_LinearSolver 2024-12-20 07:31:51 -06:00
Jonathan Bentz
6fe4a6bab8 fix copy for cuSolverDN_LinearSolver 2024-12-19 18:12:20 -06:00
Jonathan Bentz
66631e4f96 cuSolverSp_LowLeverQR copy files individually 2024-12-19 17:44:56 -06:00
Jonathan Bentz
7e1a257265 adding USE_MATH_DEFINES to simpleCUFFT_2d_MGPU 2024-12-19 16:35:19 -06:00
Rob Armstrong
d9efeae3bb
Merge pull request #2 from jnbntz/linux_guarding
Guarding Linux-only samples so they only get built on Linux machines
2024-12-19 11:25:13 -08:00
Jonathan Bentz
9c4287fc51 one more fix on guarding Linux 2024-12-19 13:20:48 -06:00
Jonathan Bentz
8d06d246b3 Updating for more linux guarding 2024-12-19 13:15:39 -06:00
Jonathan Bentz
1292881b37 reverting to previous commit 2024-12-19 10:53:11 -06:00
Rob Armstrong
a55d6682d2 Update DirectX samples for Windows builds 2024-12-18 20:06:37 -08:00
Jonathan Bentz
f9dd4323af adding code to Linux-only samples to guard against building on Windows 2024-12-18 14:36:12 -06:00
Rob Armstrong
fcd39008ec Add BUILD_TEGRA flag 2024-12-18 11:28:38 -08:00
Rob Armstrong
01b7ee41f6 Update DirectX samples 2024-12-18 11:28:26 -08:00
Rob Armstrong
25b33d2d04 Update top-level build instructions 2024-12-18 11:02:35 -08:00
Rob Armstrong
bfd956bc5e Remove README references to PPC processors (no longer supported) 2024-12-18 10:54:37 -08:00
Rob Armstrong
0f5821a8c8 Remove outdated build instructions from README.md 2024-12-18 10:52:24 -08:00
Rob Armstrong
c1301d000a Resolve Windows build issues 2024-12-17 22:11:52 -08:00
Rob Armstrong
7f5859dda9 Update .gitignore for Visual Studio builds 2024-12-17 21:46:53 -08:00
Rob Armstrong
abb97e1dfb Update copy_directory_if_different to copy_directory for CMake 3.20 compatibility 2024-12-17 10:22:02 -08:00
Rob Armstrong
22bedd5cf0 Update CMake project language settings for library samples 2024-12-16 16:46:57 -08:00
Rob Armstrong
d54d4d7419 Update supported architecture list for some samples 2024-12-16 16:35:23 -08:00
Rob Armstrong
03719b7623 Change remaining build targets to specify active SM variants 2024-12-16 16:17:14 -08:00
Rob Armstrong
11fc617794 Refactor CMakeLists.txt under 5_Domain_Specific 2024-12-16 16:10:56 -08:00
Rob Armstrong
7d7e0777a5 Update volumeFiltering, volumeRender, vulkanImageCUDA 2024-12-16 16:07:41 -08:00
Rob Armstrong
a60b4a984e Update SobelFilter, SobolQRNG, stereoDisparity 2024-12-16 15:55:04 -08:00
Rob Armstrong
7b0068a433 Update simpleVulkanMMAP, smokeParticles 2024-12-16 15:11:32 -08:00
Rob Armstrong
2ad2272e3d Change some applicable build architectures 2024-12-16 14:52:34 -08:00
Rob Armstrong
cbfab74480 Refactor CMakeLists.txt under 6_Performance 2024-12-16 14:52:10 -08:00
Rob Armstrong
00999c1789 Refactor CMakeLists.txt under 4_CUDA_Libraries 2024-12-16 14:38:36 -08:00
Rob Armstrong
090f957854 Refactor CMakeLists.txt under 3_CUDA_Features 2024-12-16 14:37:14 -08:00
Rob Armstrong
ea694a4b0d Refactor CMakeLists.txt under 2_Concepts_and_Techniques 2024-12-16 14:34:20 -08:00
Rob Armstrong
281daef279 Refactor CMakeLists.txt under 1_Utilities 2024-12-16 14:30:38 -08:00
Rob Armstrong
23928df4ff Change build target to specify active SM variants 2024-12-16 14:11:47 -08:00
Rob Armstrong
8d2e39c395 Make target compile options, language standards target-specific 2024-12-16 12:01:03 -08:00
Rob Armstrong
1bb070deba Make each CMakeLists.txt under 0_Introduction its own project 2024-12-16 09:23:32 -08:00
Rob Armstrong
07e5fc5473 Update simpleGL, simpleVulkan 2024-12-16 09:18:15 -08:00
Rob Armstrong
de204853cf Update postProcessGL, quasirandomGenerator, quasirandomGenerator_nvrtc 2024-12-14 10:58:26 -08:00
Rob Armstrong
3f0693b37e Update nbody, NV12toBGRandResize, p2pBandwidthlatencyTest 2024-12-14 10:49:04 -08:00
Rob Armstrong
4fc7cdb95c Update HSOpticalFlow, Mandelbrot 2024-12-14 10:40:33 -08:00
Rob Armstrong
ba77e8d23b Update fluidsGL, marchingCubes, MonteCarloMultiGPU 2024-12-14 10:28:20 -08:00
Rob Armstrong
10e9b975c4 Update fastWalshTransform, FDTD3d 2024-12-14 10:00:39 -08:00
Rob Armstrong
8b5c84cd22 Remove FDTD3d CMakeLists.txt entry 2024-12-13 16:11:27 -08:00
Rob Armstrong
6929eea2ed Update BlackScholes, BlackScholes_nvrtc, convolutionFFT2D, dwtHaar1D, dxtc 2024-12-13 16:09:49 -08:00
Rob Armstrong
cd485da765 Fix bicubicTexture CMakeLists.txt 2024-12-13 16:03:19 -08:00
Rob Armstrong
223e658beb Update bicubicTexture, bilateralFilter, binomialOptions, binomialOptions_nvrtc 2024-12-13 16:00:40 -08:00
Rob Armstrong
c9794ff283 Remove legacy fluidsD3D9 2024-12-13 15:37:58 -08:00
Rob Armstrong
035dcfd357 Remove legacy Direct3D 9 and 10 interoperability samples 2024-12-13 15:29:09 -08:00
Rob Armstrong
37922e6429 Update simpleCUFFT, simpleCUFFT_2d_MGPU, simpleCUFFT_callback, simpleCUFFT_MGPU 2024-12-13 14:47:58 -08:00
Rob Armstrong
05d2f991de Update simpleCUBLAS, simpleCUBLAS_LU, simpleCUBLASXT 2024-12-13 14:43:47 -08:00
Rob Armstrong
b6f3065605 Update oceanFFT, randomFog 2024-12-13 14:40:06 -08:00
Rob Armstrong
461fc3c649 Update nvJPEG, nvJPEG_encoder 2024-12-13 14:35:27 -08:00
Rob Armstrong
b1837c0e4e Update jitLto, lineOfSight, matrixMulCUBLAS, MersenneTwisterGP11213 2024-12-13 13:54:50 -08:00
Rob Armstrong
7fde420160 Update FilterBorderControlNPP, freeImageInteropNPP, histEqualizationNPP, watershedSegmentationNPP 2024-12-13 13:47:24 -08:00
Rob Armstrong
0d161038a2 Update cuSolver samples 2024-12-13 12:11:13 -08:00
Rob Armstrong
ee8ff3cf5b Move cuDLA, NVSci samples to 8_Platform_Specific/Tegra 2024-12-13 11:58:52 -08:00
Rob Armstrong
7568673fa6 Update conjugate gradient samples 2024-12-13 11:45:05 -08:00
Rob Armstrong
4543e7bbab Update cannyEdgeDetectorNPP 2024-12-13 11:05:35 -08:00
Rob Armstrong
a3be0d3cd8 Update boxFilterNPP, delete batchedLabelMarkersAndLabelCompressionNPP 2024-12-13 10:38:24 -08:00
Rob Armstrong
89f2e5c0c3 Integrate libNVVM samples 2024-12-12 12:06:34 -08:00
Rob Armstrong
f93a9ab81c Update UnifiedMemoryPerf 2024-12-12 11:50:35 -08:00
Rob Armstrong
fb1eaa8323 Update alignedTypes, cudaGraphsPerfScaling, LargeKernelParameter, transpose 2024-12-12 11:48:07 -08:00
Rob Armstrong
2f826e305a Update ptxjit 2024-12-12 11:34:01 -08:00
Rob Armstrong
9bebdf7ef4 Update jacobiCudaGraphs, memMapIPCDrv, newdelete, simpleCudaGraphs, tf32TensorCoreGemm, warpAggregatedAtomicsCG 2024-12-12 11:29:48 -08:00
Rob Armstrong
6fd8228242 Update graphMemoryFootprint, graphMemoryNodes, immaTensorCoreGemm 2024-12-12 11:19:17 -08:00
Rob Armstrong
76210c84f7 Update cudaCompressibleMemory, cudaTensorCoreGemm, dmmaTensorCoreGemm, globalToShmemAsyncCopy, graphConditionalNodes 2024-12-12 11:17:35 -08:00
Rob Armstrong
62d32b38d7 Update bindlessTexture, CDP samples 2024-12-12 11:11:51 -08:00
Rob Armstrong
dd73281bc6 Update StreamPriorities, bf16TensorCoreGemm, binaryPartitionCG 2024-12-12 11:02:13 -08:00
Rob Armstrong
df5ff58ca4 Update EGL samples (NB EGLSync_CUDAEvent_Interop link error in debug) 2024-12-12 10:43:28 -08:00
Rob Armstrong
c7fe3b2f4c Update Monte Carlo pricing models 2024-12-12 09:29:50 -08:00
Rob Armstrong
cd51392c0f Update threadMigration, threadFenceReduction 2024-12-12 09:19:48 -08:00
Rob Armstrong
0eaf6d4198 Update shfl_scan, sortingNetworks, streamOrderedAllocation, streamOrderedAllocationIPC, streamOrderedAllocationP2P 2024-12-12 09:14:11 -08:00
Rob Armstrong
551f0dbebe Update scan, scalarProd, segmentationTreeThrust 2024-12-12 09:02:53 -08:00
Rob Armstrong
29bde74c86 Update radixSortThrust, reduction, reductionMultiBlockCG 2024-12-12 08:50:34 -08:00
Rob Armstrong
a60926da2c Update interval, particles 2024-12-12 08:46:51 -08:00
Rob Armstrong
039e445f93 Update imageDenoising, inlinePTX, inlinePTX_nvrtc 2024-12-12 08:22:20 -08:00
Rob Armstrong
3fd2fbce00 Update histogram, FunctionPointers 2024-12-12 08:14:59 -08:00
Rob Armstrong
b284c28ce5 Update eigenvalues 2024-12-11 18:08:52 -08:00
Rob Armstrong
74107d2da1 Update boxFilter, convolutionSeparable, convolutionTexture, dct8x8, delete cuHook 2024-12-11 18:01:43 -08:00
Rob Armstrong
12d096790b Update utility samples 2024-12-11 16:51:56 -08:00
Rob Armstrong
5fab4d73f5 Introductory README - remove entries for removed samples 2024-12-11 16:12:55 -08:00
Rob Armstrong
8a2c175abc Update README (not final) 2024-12-11 16:10:13 -08:00
Rob Armstrong
0dc8ecc1f6 Update vector add samples 2024-12-11 16:06:48 -08:00
Rob Armstrong
a14d734664 Update template, simpleVoteIntrinsics - remove simpleVoteIntrinsics_nvrtc 2024-12-11 15:58:41 -08:00
Rob Armstrong
56ad17f97c Update simpleTexture3D, simpleTextureDrv 2024-12-11 15:55:04 -08:00
Rob Armstrong
7013a0b70a Update simpleTemplates, simpleTexture 2024-12-11 15:45:57 -08:00
Rob Armstrong
a461e61485 Remove simpleTemplates_nvrtc 2024-12-11 15:41:41 -08:00
Rob Armstrong
769a225af3 Remove simpleSeparateCompilation 2024-12-11 15:38:08 -08:00
Rob Armstrong
53a02af443 Uniquify fatbin generation target for matrixMulDrv 2024-12-11 15:25:53 -08:00
Rob Armstrong
9b70771583 Update simpleMPI, simpleDrvRuntime 2024-12-11 15:24:51 -08:00
Rob Armstrong
7ce058b479 Update simpleCallback, simpleCUDA2GL 2024-12-11 15:09:43 -08:00
Rob Armstrong
8663c8bf19 Update matrixMul_nvrtc (TODO: Fix search paths) 2024-12-11 22:55:27 +00:00
Rob Armstrong
2dfdd55e29 Update matrixMulDrv, matrixMulDynlinkJIT 2024-12-11 22:33:25 +00:00
Rob Armstrong
c52cfc339f Update UnifiedMemoryStreams, simpleStreams, simpleSurfaceWrite, simpleZeroCopy, systemWideAtomics 2024-12-11 20:56:37 +00:00
Rob Armstrong
bcf4a0dd31 Update simpleOccupancy, simpleP2P, simplePitchLinearTexture, simplePrintf 2024-12-11 20:44:47 +00:00
Rob Armstrong
9efe753a74 Update simpleHyperQ, simpleIPC, simpleLayeredTexture, simpleMultiCopy, simpleMultiGPU 2024-12-11 20:42:18 +00:00
Rob Armstrong
2dee482699 Update simpleAtomicIntrinsics_nvrtc 2024-12-11 20:35:09 +00:00
Rob Armstrong
29a2098575 Update simpleAtomicIntrinsics 2024-12-11 20:30:56 +00:00
Rob Armstrong
a0774d28f1 Update simpleAssert_nvrtc 2024-12-11 20:28:53 +00:00
Rob Armstrong
190d0cbaa7 Add missed CMakeLists 2024-12-11 20:24:08 +00:00
Rob Armstrong
e53fcad712 Update simpleAssert, simpleAttributes, simpleCooperativeGroups, simpleCubemapTexture 2024-12-11 20:23:04 +00:00
Rob Armstrong
d06e42bf06 Update simpleAWBarrier 2024-12-11 20:16:50 +00:00
Rob Armstrong
fd2c269ce3 Update mergeSort 2024-12-11 19:36:54 +00:00
Rob Armstrong
a1cd67ca87 Update matrixMul 2024-12-11 19:22:13 +00:00
Rob Armstrong
9a207a910a Update fp16ScalarProduct 2024-12-11 19:20:36 +00:00
Rob Armstrong
2a4d019282 Update cudaOpenMP 2024-12-11 19:18:12 +00:00
Rob Armstrong
8e03cbfcf9 Remove cppOverload 2024-12-11 19:12:34 +00:00
Rob Armstrong
9d8f61431e Remove cppIntegration 2024-12-11 19:11:40 +00:00
Rob Armstrong
ff264a798f Remove concurrentKernels 2024-12-11 17:21:00 +00:00
Rob Armstrong
274836a1a2 Update clock_nvrtc 2024-12-11 17:20:34 +00:00
Rob Armstrong
c15a0c4bbf Remove c++11_cuda sample 2024-12-11 16:46:54 +00:00
Rob Armstrong
912f37963a Add basic CMake framework 2024-12-11 16:46:14 +00:00
Rob Armstrong
82bcada84c Remove now-unnecessary Visual Studio project files 2024-12-11 16:25:06 +00:00
steffen-v
22424227e7
fix "gridy" comandline argument for initMC 2024-07-26 14:42:05 +02:00
Rob Nertney
9c688d7ff7 Updating samples for CUDA 12.5 2024-07-25 16:30:13 +00:00
Rob Nertney
5f97d7d0df Updating graphConditionalNodes orphan directory 2024-04-10 19:44:42 +00:00
Rob Nertney
3559ca4d08 Updating README with Confidential Computing notes 2024-03-05 21:01:35 +00:00
Rob Nertney
cd3bc1fa8e Updating samples for CUDA 12.4 2024-03-05 20:53:50 +00:00
Sangeet S
42ff742bf5
Merge pull request #1 from sangeetsatheesh/sangeetsatheesh-fix-typo
Fix typo #161
2024-01-17 13:16:53 -05:00
Sangeet S
8ccb13c6f0
Fix typo #161
Fix typo in line 14 from "simple exemple" to "simple example"
2024-01-17 13:16:01 -05:00
Rob Nertney
e8568c4173 Fixing jitlto regression, including missing cuDLA source files for bug #235, and updating changelogs 2023-11-09 16:52:00 +00:00
Rob Nertney
b5c84e6996 Updating Samples for 12.3 and updating props files 2023-10-23 18:44:49 +00:00
Rob Nertney
c46754b877 Update samples for 12.3 2023-10-20 17:38:48 +00:00
Rob Nertney
03309a2d42 Changelog updates 2023-06-29 19:33:40 +00:00
Rob Nertney
5688ee0013 Removing stray cpp from master 2023-05-31 17:48:13 +00:00
Rob Nertney
8004ad59ab Fix #194 and add Large Kernel Parameters Sample 2023-05-31 04:43:22 +00:00
Rob Nertney
e612904184
Merge pull request #182 from Wenlong-Zhu/master
Fix cudaExtent.width set error.
2023-03-27 20:53:45 -07:00
Rob Nertney
81cf058e30 Updating Samples for 12.1 2023-03-01 01:41:29 +00:00
Rob Nertney
26665bf33b Fixing README 2023-02-27 22:35:39 +00:00
Rob Nertney
00bb9bc367 Updating files for Ada architecture 2023-02-27 22:33:19 +00:00
Rob Nertney
e4789153d5 Updating License Header 2023-02-09 19:02:33 +00:00
Rob Nertney
1c2efac7c8 Adding SM number for Ada Architecture 2023-02-07 19:06:53 +00:00
Rob Nertney
3d553b2ea1 Adding JIT LTO Sample 2023-02-07 19:06:38 +00:00
wenlong-zhu
9316529638 Fix cudaExtent.width set error.
unit: 4_CUDA_Libraries/cudaNvSciNvMedia/cuda_consumer.cu
Because of the change of padding size in NvSciBuf,
the cudaExtent.width and cudaExtent.height should be changed

Bug 3880762
2023-02-04 00:00:44 +08:00
Rob Nertney
2b689228b7 Updating samples for 12.0 2022-12-08 20:19:55 +00:00
Rob Nertney
81992093d2 Update samples for CUDA 11.8 with correct props 2022-10-14 17:43:37 -07:00
Rutwik Choughule
b312abaa07 add check for filename in nvrtc_helper.h 2022-02-03 18:12:24 +05:30
Rutwik Choughule
8f21b899b6 update dependency related links in README files 2022-01-27 17:58:13 +05:30
Rutwik Choughule
0cbe5f2d82 update makefiles to waive unsupported samples on QNX 2022-01-27 17:57:02 +05:30
Rutwik Choughule
805e60bdfc update lib path for conda 2022-01-27 17:55:38 +05:30
Rutwik Choughule
9d4c014f60 update sample cudaNvSci 2022-01-25 17:22:31 +05:30
Rutwik Choughule
bf8c6dd043 update lib path for conda 2022-01-14 02:31:40 +05:30
2926 changed files with 127291 additions and 371137 deletions

49
.clang-format Normal file

@@ -0,0 +1,49 @@
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: Consecutive
AlignConsecutiveDeclarations: Consecutive
AlignConsecutiveMacros: Consecutive
AlignEscapedNewlines: Left
AlignOperands: AlignAfterOperator
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
  AfterClass: true
  AfterControlStatement: false
  AfterExternBlock: true
  AfterFunction: true
  AfterStruct: true
  AfterUnion: true
  BeforeCatch: true
  BeforeElse: true
  IndentBraces: false
BreakBeforeBraces: Custom
BreakBeforeConceptDeclarations: true
BreakBeforeBinaryOperators: NonAssignment
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
ColumnLimit: 120
DerivePointerAlignment: false
FixNamespaceComments: true
IncludeCategories:
  - Regex: '^<.*>'
    Priority: 1
  - Regex: '^".*"'
    Priority: 2
SortIncludes: true
IncludeBlocks: Regroup
IndentWidth: 4
MaxEmptyLinesToKeep: 2
PointerAlignment: Right
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
Standard: c++17
TabWidth: 4
UseTab: Never
...
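
For reference, the style above can also be applied by hand; a minimal sketch, assuming `clang-format` is installed and invoked from the repository root with the same arguments the clang-format pre-commit hook uses below (the sample path is purely illustrative):

```bash
# Apply the repository's .clang-format in place; -style=file picks it up automatically
clang-format -fallback-style=none -style=file -i Samples/0_Introduction/matrixMul/matrixMul.cu
```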

6
.gitignore vendored Normal file

@@ -0,0 +1,6 @@
build
.vs
.clangd
test
settings.json
launch.json

106
.pre-commit-config.yaml Normal file

@@ -0,0 +1,106 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
ci:
  autofix_commit_msg: |
    [pre-commit.ci] auto code formatting
  autofix_prs: false
  autoupdate_branch: ''
  autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
  autoupdate_schedule: quarterly
  skip: []
  submodules: false
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: end-of-file-fixer
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$|
            .*\.py$|
            .*\.json$
          )
      - id: mixed-line-ending
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$|
            .*\.py$|
            .*\.json$
          )
      - id: trailing-whitespace
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$|
            .*\.py$|
            .*\.json$
          )
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.6
    hooks:
      - id: clang-format
        types_or: [file]
        files: |
          (?x)^(
            ^.*\.c$|
            ^.*\.cpp$|
            ^.*\.cu$|
            ^.*\.cuh$|
            ^.*\.cxx$|
            ^.*\.h$|
            ^.*\.hpp$|
            ^.*\.inl$|
            ^.*\.mm$
          )
        exclude: |
          (?x)^(
            Common/.*
          )
        args: ["-fallback-style=none", "-style=file", "-i"]

CHANGELOG.md

@@ -1,5 +1,86 @@
## Changelog
### CUDA 12.9
* Updated toolchain for cross-compilation for Tegra Linux platforms.
* Added `run_tests.py` utility to exercise all samples. See README.md for details
* Repository has been updated with consistent code formatting across all samples
* Many small code tweaks and bug fixes (see commit history for details)
* Removed the following outdated samples:
* `1_Utilities`
* `bandwidthTest` - this sample was out of date and did not produce accurate results. For bandwidth
testing of NVIDIA GPU platforms, please refer to [NVBandwidth](https://github.com/NVIDIA/nvbandwidth)
### CUDA 12.8
* Updated build system across the repository to CMake. Removed Visual Studio project files and Makefiles.
* Removed the following outdated samples:
* `0_Introduction`
* `c++11_cuda` demonstrating CUDA and C++ 11 interoperability (reason: obsolete)
* `concurrentKernels` demonstrating the ability to run multiple kernels simultaneously (reason: obsolete)
* `cppIntegration` demonstrating calling between .cu and .cpp files (reason: obsolete)
* `cppOverload` demonstrating C++ function overloading (reason: obsolete)
* `simpleSeparateCompilation` demonstrating NVCC compilation to a static library (reason: trivial)
* `simpleTemplates_nvrtc` demonstrating NVRTC usage for `simpleTemplates` sample (reason: redundant)
* `simpleVoteIntrinsics_nvrtc` demonstrating NVRTC usage for `simpleVoteIntrinsics` sample (reason: redundant)
* `2_Concepts_and_Techniques`
* `cuHook` demonstrating dlsym hooks. (reason: incompatible with modern `glibc`)
* `4_CUDA_Libraries`
* `batchedLabelMarkersAndLabelCompressionNPP` demonstrating NPP features (reason: some functionality removed from library)
* `5_Domain_Specific`
* Legacy Direct3D 9 and 10 interoperability samples:
* `fluidsD3D9`
* `simpleD3D10`
* `simpleD3D10RenderTarget`
* `simpleD3D10Texture`
* `simpleD3D9`
* `simpleD3D9Texture`
* `SLID3D10Texture`
* `VFlockingD3D10`
* `8_Platform_Specific/Tegra`
* Temporarily removed the following two samples pending updates:
* `nbody_screen` demonstrating the nbody sample in QNX
* `simpleGLES_screen` demonstrating GLES interop in QNX
* Moved the following Tegra-specific samples to a dedicated subdirectory: `8_Platform_Specific/Tegra`
* `EGLSync_CUDAEvent_Interop`
* `cuDLAErrorReporting`
* `cuDLAHybridMode`
* `cuDLALayerwiseStatsHybrid`
* `cuDLALayerwiseStatsStandalone`
* `cuDLAStandaloneMode`
* `cudaNvSciBufMultiplanar`
* `cudaNvSciNvMedia`
* `fluidsGLES`
* `nbody_opengles`
* `simpleGLES`
* `simpleGLES_EGLOutput`
### CUDA 12.5
### CUDA 12.4
* Added graphConditionalNodes Sample
### CUDA 12.3
* Added cuDLA samples
* Fixed jitLto regression
### CUDA 12.2
* libNVVM samples received updates
* Fixed jitLto Case issues
* Enabled HOST_COMPILER flag in the makefiles for GCC, which is untested but may still work.
### CUDA 12.1
* Added new sample for Large Kernels
### CUDA 12.0
* Added new flags for JIT compiling
* Removed deprecated APIs in Hopper Architecture
### CUDA 11.6
* Added new folder structure for samples
* Added support of Visual Studio 2022 to all samples supported on [Windows](#windows-1).
* All CUDA samples are now only available on [GitHub](https://github.com/nvidia/cuda-samples). They are no longer available via CUDA toolkit.
### CUDA 11.5
* Added `cuDLAHybridMode`. Demonstrate usage of cuDLA in hybrid mode.
* Added `cuDLAStandaloneMode`. Demonstrate usage of cuDLA in standalone mode.
@@ -114,4 +195,4 @@ This is the first release of CUDA Samples on GitHub:
* Added `conjugateGradientMultiBlockCG`. Demonstrates a conjugate gradient solver on GPU using Multi Block Cooperative Groups.
* Added `conjugateGradientMultiDeviceCG`. Demonstrates a conjugate gradient solver on multiple GPUs using Multi Device Cooperative Groups, also uses unified memory prefetching and usage hints APIs.
* Added `simpleCUBLAS`. Demonstrates how to perform GEMM operations using CUBLAS library.
* Added `simpleCUFFT`. Demonstrates how to perform FFT operations using CUFFT library.

27
CMakeLists.txt Normal file

@@ -0,0 +1,27 @@
cmake_minimum_required(VERSION 3.20)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
project(cuda-samples LANGUAGES C CXX CUDA)
find_package(CUDAToolkit REQUIRED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
add_subdirectory(Samples)
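
As a usage sketch (an assumption about a typical workflow, not part of this diff), the top-level project can be configured and built out of tree, with `ENABLE_CUDA_DEBUG` selecting the `-G` branch above instead of `-lineinfo`:

```bash
# Configure an out-of-tree build; ENABLE_CUDA_DEBUG=ON compiles device code with -G for cuda-gdb
cmake -B build -DENABLE_CUDA_DEBUG=ON
# Build all samples in parallel
cmake --build build -j
```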

103
CONTRIBUTING.md Normal file

@@ -0,0 +1,103 @@
# Contributing to the CUDA Samples
Thank you for your interest in contributing to the CUDA Samples!
## Getting Started
1. **Fork & Clone the Repository**:
Fork the repository and clone the fork. For more information, check [GitHub's documentation on forking](https://docs.github.com/en/github/getting-started-with-github/fork-a-repo) and [cloning a repository](https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/cloning-a-repository).
## Making Changes
1. **Create a New Branch**:
```bash
git checkout -b your-feature-branch
```
2. **Make Changes**.
3. **Build and Test**:
Ensure changes don't break existing functionality by building and running tests.
For more details on building and testing, refer to the [Building and Testing](#building-and-testing) section below.
4. **Commit Changes**:
```bash
git commit -m "Brief description of the change"
```
## Building and Testing
For information on building and running tests on the samples, please refer to the main [README](README.md).
## Creating a Pull Request
1. Push changes to your fork
2. Create a pull request targeting the `master` branch of the original CUDA Samples repository. Refer to [GitHub's documentation](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) for more information on creating a pull request.
3. Describe the purpose and context of the changes in the pull request description.
## Code Formatting (pre-commit hooks)
The CUDA Samples repository uses [pre-commit](https://pre-commit.com/) to execute all code linters and formatters. These
tools ensure a consistent coding style throughout the project. Using pre-commit ensures that linter
versions and options are aligned for all developers. Additionally, there is a CI check in place to
enforce that committed code follows our standards.
The linters used by the CUDA Samples are listed in `.pre-commit-config.yaml`.
For example, C++ and CUDA code is formatted with [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html).
To use `pre-commit`, install via `conda` or `pip`:
```bash
conda config --add channels conda-forge
conda install pre-commit
```
```bash
pip install pre-commit
```
Then run pre-commit hooks before committing code:
```bash
pre-commit run
```
By default, pre-commit runs on staged files (only changes and additions that will be committed).
To run pre-commit checks on all files, execute:
```bash
pre-commit run --all-files
```
Optionally, you may set up the pre-commit hooks to run automatically when you make a git commit. This can be done by running:
```bash
pre-commit install
```
Now code linters and formatters will be run each time you commit changes.
You can skip these checks with `git commit --no-verify` or with the short version `git commit -n`, although please note
that this may result in pull requests being rejected if subsequent checks fail.
## Review Process
Once submitted, maintainers will be automatically assigned to review the pull request. They might suggest changes or improvements. Constructive feedback is a part of the collaborative process, aimed at ensuring the highest quality code.
For constructive feedback and effective communication during reviews, we recommend following [Conventional Comments](https://conventionalcomments.org/).
Further recommended reading for successful PR reviews:
- [How to Do Code Reviews Like a Human (Part One)](https://mtlynch.io/human-code-reviews-1/)
- [How to Do Code Reviews Like a Human (Part Two)](https://mtlynch.io/human-code-reviews-2/)
## Thank You
Your contributions enhance the CUDA Samples for the entire community. We appreciate your effort and collaboration!

dynlink_d3d10.h

@@ -1,294 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//--------------------------------------------------------------------------------------
// File: dynlink_d3d10.h
//
// Shortcut macros and functions for using DX objects
//
// Copyright (c) Microsoft Corporation. All rights reserved
//--------------------------------------------------------------------------------------
#ifndef _DYNLINK_D3D10_H_
#define _DYNLINK_D3D10_H_
// Standard Windows includes
#include <windows.h>
#include <initguid.h>
#include <assert.h>
#include <wchar.h>
#include <mmsystem.h>
#include <commctrl.h> // for InitCommonControls()
#include <shellapi.h> // for ExtractIcon()
#include <new.h> // for placement new
#include <shlobj.h>
#include <math.h>
#include <limits.h>
#include <stdio.h>
// CRT's memory leak detection
#if defined(DEBUG) || defined(_DEBUG)
#include <crtdbg.h>
#endif
// Direct3D9 includes
#include <d3d9.h>
// Direct3D10 includes
#include <dxgi.h>
#include <d3d10_1.h>
#include <d3d10.h>
// XInput includes
#include <xinput.h>
// strsafe.h deprecates old unsecure string functions. If you
// really do not want it to (not recommended), then uncomment the next line
//#define STRSAFE_NO_DEPRECATE
#ifndef STRSAFE_NO_DEPRECATE
#pragma deprecated("strncpy")
#pragma deprecated("wcsncpy")
#pragma deprecated("_tcsncpy")
#pragma deprecated("wcsncat")
#pragma deprecated("strncat")
#pragma deprecated("_tcsncat")
#endif
#pragma warning( disable : 4996 ) // disable deprecated warning
#include <strsafe.h>
#pragma warning( default : 4996 )
#include <DirectXMath.h>
using namespace DirectX;
//--------------------------------------------------------------------------------------
// Structs
//--------------------------------------------------------------------------------------
struct DXUTD3D9DeviceSettings
{
UINT AdapterOrdinal;
D3DDEVTYPE DeviceType;
D3DFORMAT AdapterFormat;
DWORD BehaviorFlags;
D3DPRESENT_PARAMETERS pp;
};
struct DXUTD3D10DeviceSettings
{
UINT AdapterOrdinal;
D3D10_DRIVER_TYPE DriverType;
UINT Output;
DXGI_SWAP_CHAIN_DESC sd;
UINT32 CreateFlags;
UINT32 SyncInterval;
DWORD PresentFlags;
bool AutoCreateDepthStencil; // DXUT will create the depth stencil resource and view if true
DXGI_FORMAT AutoDepthStencilFormat;
};
enum DXUTDeviceVersion { DXUT_D3D9_DEVICE, DXUT_D3D10_DEVICE };
struct DXUTDeviceSettings
{
DXUTDeviceVersion ver;
union
{
DXUTD3D9DeviceSettings d3d9; // only valid if ver == DXUT_D3D9_DEVICE
DXUTD3D10DeviceSettings d3d10; // only valid if ver == DXUT_D3D10_DEVICE
};
};
//--------------------------------------------------------------------------------------
// Error codes
//--------------------------------------------------------------------------------------
#define DXUTERR_NODIRECT3D MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0901)
#define DXUTERR_NOCOMPATIBLEDEVICES MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0902)
#define DXUTERR_MEDIANOTFOUND MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0903)
#define DXUTERR_NONZEROREFCOUNT MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0904)
#define DXUTERR_CREATINGDEVICE MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0905)
#define DXUTERR_RESETTINGDEVICE MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0906)
#define DXUTERR_CREATINGDEVICEOBJECTS MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0907)
#define DXUTERR_RESETTINGDEVICEOBJECTS MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x0908)
#define DXUTERR_DEVICEREMOVED MAKE_HRESULT(SEVERITY_ERROR, FACILITY_ITF, 0x090A)
typedef HRESULT(WINAPI *LPCREATEDXGIFACTORY)(REFIID, void **);
typedef HRESULT(WINAPI *LPD3D10CREATEDEVICE)(IDXGIAdapter *, D3D10_DRIVER_TYPE, HMODULE, UINT, UINT32,
ID3D10Device **);
typedef HRESULT(WINAPI *LPD3D10CREATEDEVICE1)(IDXGIAdapter *, D3D10_DRIVER_TYPE, HMODULE, UINT,
D3D10_FEATURE_LEVEL1, UINT, ID3D10Device1 **);
typedef HRESULT(WINAPI *LPD3D10CREATESTATEBLOCK)(ID3D10Device *pDevice, D3D10_STATE_BLOCK_MASK *pStateBlockMask,
ID3D10StateBlock **ppStateBlock);
typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKUNION)(D3D10_STATE_BLOCK_MASK *pA, D3D10_STATE_BLOCK_MASK *pB,
D3D10_STATE_BLOCK_MASK *pResult);
typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKINTERSECT)(D3D10_STATE_BLOCK_MASK *pA, D3D10_STATE_BLOCK_MASK *pB,
D3D10_STATE_BLOCK_MASK *pResult);
typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKDIFFERENCE)(D3D10_STATE_BLOCK_MASK *pA, D3D10_STATE_BLOCK_MASK *pB,
D3D10_STATE_BLOCK_MASK *pResult);
typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKENABLECAPTURE)(D3D10_STATE_BLOCK_MASK *pMask,
D3D10_DEVICE_STATE_TYPES StateType, UINT RangeStart,
UINT RangeLength);
typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKDISABLECAPTURE)(D3D10_STATE_BLOCK_MASK *pMask,
D3D10_DEVICE_STATE_TYPES StateType, UINT RangeStart,
UINT RangeLength);
typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKENABLEALL)(D3D10_STATE_BLOCK_MASK *pMask);
typedef HRESULT(WINAPI *LPD3D10STATEBLOCKMASKDISABLEALL)(D3D10_STATE_BLOCK_MASK *pMask);
typedef BOOL (WINAPI *LPD3D10STATEBLOCKMASKGETSETTING)(D3D10_STATE_BLOCK_MASK *pMask,
D3D10_DEVICE_STATE_TYPES StateType, UINT Entry);
typedef HRESULT(WINAPI *LPD3D10COMPILEEFFECTFROMMEMORY)(void *pData, SIZE_T DataLength, LPCSTR pSrcFileName,
CONST D3D10_SHADER_MACRO *pDefines,
ID3D10Include *pInclude, UINT HLSLFlags, UINT FXFlags,
ID3D10Blob **ppCompiledEffect, ID3D10Blob **ppErrors);
typedef HRESULT(WINAPI *LPD3D10CREATEEFFECTFROMMEMORY)(void *pData, SIZE_T DataLength, UINT FXFlags,
ID3D10Device *pDevice,
ID3D10EffectPool *pEffectPool,
ID3D10Effect **ppEffect);
typedef HRESULT(WINAPI *LPD3D10CREATEEFFECTPOOLFROMMEMORY)(void *pData, SIZE_T DataLength, UINT FXFlags,
ID3D10Device *pDevice, ID3D10EffectPool **ppEffectPool);
typedef HRESULT(WINAPI *LPD3D10CREATEDEVICEANDSWAPCHAIN)(IDXGIAdapter *pAdapter,
D3D10_DRIVER_TYPE DriverType,
HMODULE Software,
UINT Flags,
UINT SDKVersion,
DXGI_SWAP_CHAIN_DESC *pSwapChainDesc,
IDXGISwapChain **ppSwapChain,
ID3D10Device **ppDevice);
typedef HRESULT(WINAPI *LPD3D10CREATEDEVICEANDSWAPCHAIN1)(IDXGIAdapter *pAdapter,
D3D10_DRIVER_TYPE DriverType,
HMODULE Software,
UINT Flags,
D3D10_FEATURE_LEVEL1 HardwareLevel,
UINT SDKVersion,
DXGI_SWAP_CHAIN_DESC *pSwapChainDesc,
IDXGISwapChain **ppSwapChain,
ID3D10Device1 **ppDevice);
// Module and function pointers
static HMODULE g_hModDXGI = NULL;
static HMODULE g_hModD3D10 = NULL;
static HMODULE g_hModD3D101 = NULL;
static LPCREATEDXGIFACTORY sFnPtr_CreateDXGIFactory = NULL;
static LPD3D10CREATESTATEBLOCK sFnPtr_D3D10CreateStateBlock = NULL;
static LPD3D10CREATEDEVICE sFnPtr_D3D10CreateDevice = NULL;
static LPD3D10CREATEDEVICE1 sFnPtr_D3D10CreateDevice1 = NULL;
static LPD3D10STATEBLOCKMASKUNION sFnPtr_D3D10StateBlockMaskUnion = NULL;
static LPD3D10STATEBLOCKMASKINTERSECT sFnPtr_D3D10StateBlockMaskIntersect = NULL;
static LPD3D10STATEBLOCKMASKDIFFERENCE sFnPtr_D3D10StateBlockMaskDifference = NULL;
static LPD3D10STATEBLOCKMASKENABLECAPTURE sFnPtr_D3D10StateBlockMaskEnableCapture = NULL;
static LPD3D10STATEBLOCKMASKDISABLECAPTURE sFnPtr_D3D10StateBlockMaskDisableCapture = NULL;
static LPD3D10STATEBLOCKMASKENABLEALL sFnPtr_D3D10StateBlockMaskEnableAll = NULL;
static LPD3D10STATEBLOCKMASKDISABLEALL sFnPtr_D3D10StateBlockMaskDisableAll = NULL;
static LPD3D10STATEBLOCKMASKGETSETTING sFnPtr_D3D10StateBlockMaskGetSetting = NULL;
static LPD3D10COMPILEEFFECTFROMMEMORY sFnPtr_D3D10CompileEffectFromMemory = NULL;
static LPD3D10CREATEEFFECTFROMMEMORY sFnPtr_D3D10CreateEffectFromMemory = NULL;
static LPD3D10CREATEEFFECTPOOLFROMMEMORY sFnPtr_D3D10CreateEffectPoolFromMemory = NULL;
static LPD3D10CREATEDEVICEANDSWAPCHAIN sFnPtr_D3D10CreateDeviceAndSwapChain = NULL;
static LPD3D10CREATEDEVICEANDSWAPCHAIN1 sFnPtr_D3D10CreateDeviceAndSwapChain1 = NULL;
// unload the D3D10 DLLs
static bool dynlinkUnloadD3D10API(void)
{
if (g_hModD3D10)
{
FreeLibrary(g_hModD3D10);
g_hModD3D10 = NULL;
}
if (g_hModDXGI)
{
FreeLibrary(g_hModDXGI);
g_hModDXGI = NULL;
}
if (g_hModD3D101)
{
FreeLibrary(g_hModD3D101);
g_hModD3D101 = NULL;
}
return true;
}
// Dynamically load the D3D10 DLLs and map the function pointers
static bool dynlinkLoadD3D10API(void)
{
// First check to see if the D3D10 library is present.
// If it is, we can call GetProcAddress to grab all of the D3D10 functions
g_hModD3D10 = LoadLibrary("d3d10.dll");
if (g_hModD3D10 != NULL)
{
sFnPtr_D3D10CreateStateBlock = (LPD3D10CREATESTATEBLOCK) GetProcAddress(g_hModD3D10, "D3D10CreateStateBlock");
sFnPtr_D3D10CreateDevice = (LPD3D10CREATEDEVICE) GetProcAddress(g_hModD3D10, "D3D10CreateDevice");
sFnPtr_D3D10StateBlockMaskUnion = (LPD3D10STATEBLOCKMASKUNION) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskUnion");
sFnPtr_D3D10StateBlockMaskIntersect = (LPD3D10STATEBLOCKMASKINTERSECT) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskIntersect");
sFnPtr_D3D10StateBlockMaskDifference = (LPD3D10STATEBLOCKMASKDIFFERENCE) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskDifference");
sFnPtr_D3D10StateBlockMaskEnableCapture = (LPD3D10STATEBLOCKMASKENABLECAPTURE) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskEnableCapture");
sFnPtr_D3D10StateBlockMaskDisableCapture = (LPD3D10STATEBLOCKMASKDISABLECAPTURE)GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskDisableCapture");
sFnPtr_D3D10StateBlockMaskEnableAll = (LPD3D10STATEBLOCKMASKENABLEALL) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskEnableAll");
sFnPtr_D3D10StateBlockMaskDisableAll = (LPD3D10STATEBLOCKMASKDISABLEALL) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskDisableAll");
sFnPtr_D3D10StateBlockMaskGetSetting = (LPD3D10STATEBLOCKMASKGETSETTING) GetProcAddress(g_hModD3D10, "D3D10StateBlockMaskGetSetting");
sFnPtr_D3D10CompileEffectFromMemory = (LPD3D10COMPILEEFFECTFROMMEMORY) GetProcAddress(g_hModD3D10, "D3D10CompileEffectFromMemory");
sFnPtr_D3D10CreateEffectFromMemory = (LPD3D10CREATEEFFECTFROMMEMORY) GetProcAddress(g_hModD3D10, "D3D10CreateEffectFromMemory");
sFnPtr_D3D10CreateEffectPoolFromMemory = (LPD3D10CREATEEFFECTPOOLFROMMEMORY) GetProcAddress(g_hModD3D10, "D3D10CreateEffectPoolFromMemory");
sFnPtr_D3D10CreateDeviceAndSwapChain = (LPD3D10CREATEDEVICEANDSWAPCHAIN) GetProcAddress(g_hModD3D10, "D3D10CreateDeviceAndSwapChain");
}
g_hModDXGI = LoadLibrary("dxgi.dll");
if (g_hModDXGI)
{
sFnPtr_CreateDXGIFactory = (LPCREATEDXGIFACTORY) GetProcAddress(g_hModDXGI , "CreateDXGIFactory");
}
// This may fail if this machine isn't Windows Vista SP1 or later
g_hModD3D101 = LoadLibrary("d3d10_1.dll");
if (g_hModD3D101 != NULL)
{
sFnPtr_D3D10CreateDevice1 = (LPD3D10CREATEDEVICE1) GetProcAddress(g_hModD3D101, "D3D10CreateDevice1");
sFnPtr_D3D10CreateDeviceAndSwapChain1 = (LPD3D10CREATEDEVICEANDSWAPCHAIN1) GetProcAddress(g_hModD3D101, "D3D10CreateDeviceAndSwapChain1");
}
if (g_hModD3D10 == NULL || g_hModDXGI == NULL || g_hModD3D101 == NULL)
{
dynlinkUnloadD3D10API();
return false;
}
return true;
}
#endif
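The helpers above make D3D10 a run-time rather than a link-time dependency: the DLLs are probed with `LoadLibrary` and the entry points resolved with `GetProcAddress`, so the sample can still start on machines without the D3D10 runtime. A minimal caller might look like the following sketch (illustrative only; it assumes the declarations above are in scope and that `D3D10_SDK_VERSION` comes from the D3D10 headers):
```cpp
// Illustrative sketch, not part of the original source.
ID3D10Device *CreateDeviceIfAvailable()
{
    if (!dynlinkLoadD3D10API()) // probes d3d10.dll, dxgi.dll and d3d10_1.dll
        return NULL;            // D3D10 runtime not installed on this machine

    ID3D10Device *pDevice = NULL;
    if (sFnPtr_D3D10CreateDevice == NULL
        || FAILED(sFnPtr_D3D10CreateDevice(NULL, D3D10_DRIVER_TYPE_HARDWARE,
                                           NULL, 0, D3D10_SDK_VERSION, &pDevice)))
    {
        dynlinkUnloadD3D10API();
        return NULL;
    }
    return pDevice; // caller releases with pDevice->Release()
}
```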

View File

@ -666,6 +666,11 @@ inline int _ConvertSMVer2Cores(int major, int minor) {
       {0x80, 64},
       {0x86, 128},
       {0x87, 128},
+      {0x89, 128},
+      {0x90, 128},
+      {0xa0, 128},
+      {0xa1, 128},
+      {0xc0, 128},
       {-1, -1}};
   int index = 0;
@ -712,6 +717,12 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) {
       {0x75, "Turing"},
       {0x80, "Ampere"},
       {0x86, "Ampere"},
+      {0x87, "Ampere"},
+      {0x89, "Ada"},
+      {0x90, "Hopper"},
+      {0xa0, "Blackwell"},
+      {0xa1, "Blackwell"},
+      {0xc0, "Blackwell"},
       {-1, "Graphics Device"}};
   int index = 0;
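These two hunks extend the SM-version lookup tables through Ada (SM 8.9), Hopper (SM 9.0), and Blackwell (SM 10.0, 10.1, 12.0). A typical use of the two helpers, sketched here under the assumption that `helper_cuda.h` is on the include path:
```cpp
#include <cstdio>
#include <cuda_runtime.h>
#include <helper_cuda.h> // defines _ConvertSMVer2Cores / _ConvertSMVer2ArchName

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // query device 0
    int coresPerSM = _ConvertSMVer2Cores(prop.major, prop.minor);
    printf("SM %d.%d (%s): %d SMs x %d cores = %d CUDA cores\n",
           prop.major, prop.minor,
           _ConvertSMVer2ArchName(prop.major, prop.minor),
           prop.multiProcessorCount, coresPerSM,
           prop.multiProcessorCount * coresPerSM);
    return 0;
}
```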

View File

@ -114,6 +114,11 @@ inline int _ConvertSMVer2CoresDRV(int major, int minor) {
       {0x80, 64},
       {0x86, 128},
       {0x87, 128},
+      {0x89, 128},
+      {0x90, 128},
+      {0xa0, 128},
+      {0xa1, 128},
+      {0xc0, 128},
       {-1, -1}};
   int index = 0;
@ -236,7 +241,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
     }
     unsigned long long compute_perf =
-        (unsigned long long)(multiProcessorCount * sm_per_multiproc *
+        ((unsigned long long)multiProcessorCount * sm_per_multiproc *
             clockRate);
     if (compute_perf > max_compute_perf) {
@ -403,4 +408,3 @@ bool inline findFatbinPath(const char *module_file, std::string &module_path, ch
 // end of CUDA Helper Functions
 #endif  // COMMON_HELPER_CUDA_DRVAPI_H_
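The `compute_perf` change above is an overflow fix rather than a refactor: with the old parenthesization the three `int` factors were multiplied in 32-bit arithmetic and only the (possibly already wrapped) result was widened. Casting the first operand promotes the whole product to 64-bit. A distilled illustration with hypothetical values:
```cpp
int mpCount = 128, smPerMp = 128, clockKHz = 2000000; // hypothetical GPU
// Old form: the int product wraps before the cast is applied.
unsigned long long wrong = (unsigned long long)(mpCount * smPerMp * clockKHz);
// New form: widening one operand makes the entire product 64-bit.
unsigned long long right = (unsigned long long)mpCount * smPerMp * clockKHz;
// wrong != right here, because 128 * 128 * 2000000 exceeds INT_MAX.
```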

View File

@ -168,7 +168,7 @@ int waitProcess(Process *process) {
 #endif
 }
-#if defined(__linux__)
+#if defined(__linux__) || defined(__QNX__)
 int ipcCreateSocket(ipcHandle *&handle, const char *name,
                     const std::vector<Process> &processes) {
   int server_fd;
@ -262,41 +262,48 @@ int ipcRecvShareableHandle(ipcHandle *handle, ShareableHandle *shHandle) {
   // Union to guarantee alignment requirements for control array
   union {
     struct cmsghdr cm;
-    char control[CMSG_SPACE(sizeof(int))];
+    // This will not work on QNX as QNX CMSG_SPACE calls __cmsg_alignbytes
+    // And __cmsg_alignbytes is a runtime function instead of compile-time macros
+    // char control[CMSG_SPACE(sizeof(int))]
+    char* control;
   } control_un;
+  size_t sizeof_control = CMSG_SPACE(sizeof(int)) * sizeof(char);
+  control_un.control = (char*) malloc(sizeof_control);
   struct cmsghdr *cmptr;
   ssize_t n;
   int receivedfd;
   char dummy_buffer[1];
   ssize_t sendResult;
   msg.msg_control = control_un.control;
-  msg.msg_controllen = sizeof(control_un.control);
+  msg.msg_controllen = sizeof_control;
   iov[0].iov_base = (void *)dummy_buffer;
   iov[0].iov_len = sizeof(dummy_buffer);
   msg.msg_iov = iov;
   msg.msg_iovlen = 1;
   if ((n = recvmsg(handle->socket, &msg, 0)) <= 0) {
     perror("IPC failure: Receiving data over socket failed");
+    free(control_un.control);
     return -1;
   }
   if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) &&
       (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
     if ((cmptr->cmsg_level != SOL_SOCKET) || (cmptr->cmsg_type != SCM_RIGHTS)) {
+      free(control_un.control);
       return -1;
     }
     memmove(&receivedfd, CMSG_DATA(cmptr), sizeof(receivedfd));
     *(int *)shHandle = receivedfd;
   } else {
+    free(control_un.control);
     return -1;
   }
+  free(control_un.control);
   return 0;
 }
@ -340,9 +347,12 @@ int ipcSendShareableHandle(ipcHandle *handle,
   union {
     struct cmsghdr cm;
-    char control[CMSG_SPACE(sizeof(int))];
+    char* control;
   } control_un;
+  size_t sizeof_control = CMSG_SPACE(sizeof(int)) * sizeof(char);
+  control_un.control = (char*) malloc(sizeof_control);
   struct cmsghdr *cmptr;
   ssize_t readResult;
   struct sockaddr_un cliaddr;
@ -360,7 +370,7 @@ int ipcSendShareableHandle(ipcHandle *handle,
   int sendfd = (int)shareableHandles[data];
   msg.msg_control = control_un.control;
-  msg.msg_controllen = sizeof(control_un.control);
+  msg.msg_controllen = sizeof_control;
   cmptr = CMSG_FIRSTHDR(&msg);
   cmptr->cmsg_len = CMSG_LEN(sizeof(int));
@ -380,9 +390,11 @@ int ipcSendShareableHandle(ipcHandle *handle,
   ssize_t sendResult = sendmsg(handle->socket, &msg, 0);
   if (sendResult <= 0) {
     perror("IPC failure: Sending data over socket failed");
+    free(control_un.control);
     return -1;
   }
+  free(control_un.control);
   return 0;
 }
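Both hunks in this file make the same change for the same reason: on QNX, `CMSG_SPACE` calls a runtime function (`__cmsg_alignbytes`), so it cannot size an array member at compile time; the control buffer is therefore `malloc`'d, and the new `free` calls release it on every exit path. The receive side of that pattern, distilled into a standalone sketch (error handling trimmed; not the sample's exact code):
```c
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Receive one file descriptor over a Unix-domain socket, sizing the
   cmsg control buffer at run time so it also works where CMSG_SPACE
   is not a compile-time constant (e.g. QNX). Returns the fd or -1. */
static int recvFd(int sock)
{
    size_t controlLen = CMSG_SPACE(sizeof(int));
    char *control = (char *)malloc(controlLen);
    char dummy;
    struct iovec iov = {&dummy, sizeof(dummy)};
    struct msghdr msg;
    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_control = control;
    msg.msg_controllen = controlLen;

    int fd = -1;
    if (recvmsg(sock, &msg, 0) > 0) {
        struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
        if (cm && cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS)
            memcpy(&fd, CMSG_DATA(cm), sizeof(fd));
    }
    free(control); /* released on every path, as in the patch above */
    return fd;
}
```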

View File

@ -84,7 +84,7 @@ int waitProcess(Process *process);
 #define checkIpcErrors(ipcFuncResult) \
   if (ipcFuncResult == -1) { fprintf(stderr, "Failure at %u %s\n", __LINE__, __FILE__); exit(EXIT_FAILURE); }
-#if defined(__linux__)
+#if defined(__linux__) || defined(__QNX__)
 struct ipcHandle_st {
   int socket;
   char *socketName;

View File

@ -421,6 +421,7 @@ inline char *sdkFindFilePath(const char *filename,
   }
   // File not found
+  printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename);
   return 0;
 }

View File

@ -258,7 +258,7 @@ namespace nv
     s[2] = &r3[0];
     s[3] = &r4[0];
-    register int i,j,p,jj;
+    int i,j,p,jj;
     for (i=0; i<4; i++)
     {

View File

@ -49,6 +49,11 @@
 void compileFileToCUBIN(char *filename, int argc, char **argv, char **cubinResult,
                         size_t *cubinResultSize, int requiresCGheaders) {
+  if (!filename) {
+    std::cerr << "\nerror: filename is empty for compileFileToCUBIN()!\n";
+    exit(1);
+  }
   std::ifstream inputFile(filename,
                           std::ios::in | std::ios::binary | std::ios::ate);
@ -111,7 +116,12 @@ void compileFileToCUBIN(char *filename, int argc, char **argv, char **cubinResul
     compileOptions = "--include-path=";
-    std::string path = sdkFindFilePath(HeaderNames, argv[0]);
+    char *strPath = sdkFindFilePath(HeaderNames, argv[0]);
+    if (!strPath) {
+      std::cerr << "\nerror: header file " << HeaderNames << " not found!\n";
+      exit(1);
+    }
+    std::string path = strPath;
     if (!path.empty()) {
       std::size_t found = path.find(HeaderNames);
       path.erase(found);
@ -120,6 +130,7 @@ void compileFileToCUBIN(char *filename, int argc, char **argv, char **cubinResul
           "\nCooperativeGroups headers not found, please install it in %s "
           "sample directory..\n Exiting..\n",
           argv[0]);
+      exit(1);
     }
     compileOptions += path.c_str();
     compileParams[numCompileOptions] = reinterpret_cast<char *>(

View File

@ -1,128 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
////////////////////////////////////////////////////////////////////////////////
//
// Utility funcs to wrap up saving a surface or the back buffer as a PPM file
// In addition, wraps up a threshold comparison of two PPMs.
//
// These functions are designed to be used to implement an automated QA testing
// for SDK samples.
//
// Author: Bryan Dudash
// Email: sdkfeedback@nvidia.com
//
// Copyright (c) NVIDIA Corporation. All rights reserved.
////////////////////////////////////////////////////////////////////////////////
#include <helper_functions.h>
#include <rendercheck_d3d10.h>
HRESULT CheckRenderD3D10::ActiveRenderTargetToPPM(ID3D10Device *pDevice,
const char *zFileName) {
ID3D10RenderTargetView *pRTV = NULL;
pDevice->OMGetRenderTargets(1, &pRTV, NULL);
ID3D10Resource *pSourceResource = NULL;
pRTV->GetResource(&pSourceResource);
return ResourceToPPM(pDevice, pSourceResource, zFileName);
}
HRESULT CheckRenderD3D10::ResourceToPPM(ID3D10Device *pDevice,
ID3D10Resource *pResource,
const char *zFileName) {
D3D10_RESOURCE_DIMENSION rType;
pResource->GetType(&rType);
if (rType != D3D10_RESOURCE_DIMENSION_TEXTURE2D) {
printf("SurfaceToPPM: pResource is not a 2D texture! Aborting...\n");
return E_FAIL;
}
ID3D10Texture2D *pSourceTexture = (ID3D10Texture2D *)pResource;
ID3D10Texture2D *pTargetTexture = NULL;
D3D10_TEXTURE2D_DESC desc;
pSourceTexture->GetDesc(&desc);
desc.BindFlags = 0;
desc.CPUAccessFlags = D3D10_CPU_ACCESS_READ;
desc.Usage = D3D10_USAGE_STAGING;
if (FAILED(pDevice->CreateTexture2D(&desc, NULL, &pTargetTexture))) {
printf(
"SurfaceToPPM: Unable to create target Texture resoruce! Aborting... "
"\n");
return E_FAIL;
}
pDevice->CopyResource(pTargetTexture, pSourceTexture);
D3D10_MAPPED_TEXTURE2D mappedTex2D;
pTargetTexture->Map(0, D3D10_MAP_READ, 0, &mappedTex2D);
// Need to convert from dx pitch to pitch=width
unsigned char *pPPMData = new unsigned char[desc.Width * desc.Height * 4];
for (unsigned int iHeight = 0; iHeight < desc.Height; iHeight++) {
memcpy(
&(pPPMData[iHeight * desc.Width * 4]),
(unsigned char *)(mappedTex2D.pData) + iHeight * mappedTex2D.RowPitch,
desc.Width * 4);
}
pTargetTexture->Unmap(0);
// Prepends the PPM header info and bumps byte data afterwards
sdkSavePPM4ub(zFileName, pPPMData, desc.Width, desc.Height);
delete[] pPPMData;
pTargetTexture->Release();
return S_OK;
}
bool CheckRenderD3D10::PPMvsPPM(const char *src_file, const char *ref_file,
const char *exec_path, const float epsilon,
const float threshold) {
char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
if (ref_file_path == NULL) {
printf(
"CheckRenderD3D10::PPMvsPPM unable to find <%s> in <%s> Aborting "
"comparison!\n",
ref_file, exec_path);
printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
ref_file);
printf("Aborting comparison!\n");
printf(" FAILURE!\n");
return false;
}
return (sdkComparePPM(src_file, ref_file_path, epsilon, threshold, true) ==
true);
}

View File

@ -1,53 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#ifndef _RENDERCHECK_D3D10_H_
#define _RENDERCHECK_D3D10_H_
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <d3d10.h>
class CheckRenderD3D10 {
public:
CheckRenderD3D10() {}
static HRESULT ActiveRenderTargetToPPM(ID3D10Device *pDevice,
const char *zFileName);
static HRESULT ResourceToPPM(ID3D10Device *pDevice, ID3D10Resource *pResource,
const char *zFileName);
static bool PPMvsPPM(const char *src_file, const char *ref_file,
const char *exec_path, const float epsilon,
const float threshold = 0.0f);
};
#endif

View File

@ -1,167 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
////////////////////////////////////////////////////////////////////////////////
//
// Utility funcs to wrap up saving a surface or the back buffer as a PPM file
// In addition, wraps up a threshold comparison of two PPMs.
//
// These functions are designed to be used to implement an automated QA testing
// for SDK samples.
//
// Author: Bryan Dudash
// Email: sdkfeedback@nvidia.com
//
// Copyright (c) NVIDIA Corporation. All rights reserved.
////////////////////////////////////////////////////////////////////////////////
#include <helper_functions.h>
#include <rendercheck_d3d9.h>
// originally copied from checkrender_gl.cpp and slightly modified
bool CheckRenderD3D9::PPMvsPPM(const char *src_file, const char *ref_file,
const char *exec_path, const float epsilon,
const float threshold) {
char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
if (ref_file_path == NULL) {
printf(
"CheckRenderD3D9::PPMvsPPM unable to find <%s> in <%s> Aborting "
"comparison!\n",
ref_file, exec_path);
printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
ref_file);
printf("Aborting comparison!\n");
printf(" FAILURE!\n");
return false;
}
return (sdkComparePPM(src_file, ref_file_path, epsilon, threshold, true) ==
true);
};
HRESULT CheckRenderD3D9::BackbufferToPPM(IDirect3DDevice9 *pDevice,
const char *zFileName) {
IDirect3DSurface9 *pSurface = NULL;
if (FAILED(
pDevice->GetBackBuffer(0, 0, D3DBACKBUFFER_TYPE_MONO, &pSurface))) {
printf("Unable to get the back buffer. Aborting...\n");
return E_FAIL;
}
// D3DXSaveSurfaceToFile("C:\\bing.dds",D3DXIFF_DDS,pSurface,NULL,NULL);
HRESULT hr = S_OK;
hr = SurfaceToPPM(pDevice, pSurface, zFileName);
pSurface->Release();
return hr;
}
HRESULT CheckRenderD3D9::SurfaceToPPM(IDirect3DDevice9 *pDevice,
IDirect3DSurface9 *pSurface,
const char *zFileName) {
D3DSURFACE_DESC pDesc;
pSurface->GetDesc(&pDesc);
// $$ For now only support common 8bit formats. TODO: support for more
// complex formats via conversion?
if (!(pDesc.Format == D3DFMT_A8R8G8B8 || pDesc.Format == D3DFMT_X8R8G8B8)) {
return E_INVALIDARG;
}
IDirect3DTexture9 *pTargetTex = NULL;
if (FAILED(pDevice->CreateTexture(pDesc.Width, pDesc.Height, 1,
D3DUSAGE_DYNAMIC, pDesc.Format,
D3DPOOL_SYSTEMMEM, &pTargetTex, NULL))) {
printf("Unable to create texture for surface transfer! Aborting...\n");
return E_FAIL;
}
IDirect3DSurface9 *pTargetSurface = NULL;
if (FAILED(pTargetTex->GetSurfaceLevel(0, &pTargetSurface))) {
printf("Unable to get surface for surface transfer! Aborting...\n");
return E_FAIL;
}
// This is required because we cannot lock a D3DPOOL_DEFAULT surface directly.
// So, we copy to our sysmem surface.
if (FAILED(pDevice->GetRenderTargetData(pSurface, pTargetSurface))) {
printf(
"Unable to GetRenderTargetData() for surface transfer! Aborting...\n");
return E_FAIL;
}
D3DLOCKED_RECT lockedRect;
HRESULT hr = pTargetSurface->LockRect(&lockedRect, NULL, 0);
// Need to convert from dx pitch to pitch=width
//
// $ PPM is BGR and not RGB it seems. Saved image looks "funny" in viewer(red
// and blue swapped), but since ref will be dumped using same method, this is
// ok.
// however, if we want the saved image to be properly colored, then we
// can swizzle the color bytes here.
unsigned char *pPPMData = new unsigned char[pDesc.Width * pDesc.Height * 4];
for (unsigned int iHeight = 0; iHeight < pDesc.Height; iHeight++) {
#if 1 // swizzle to implement RGB to BGR conversion.
for (unsigned int iWidth = 0; iWidth < pDesc.Width; iWidth++) {
DWORD color = *(DWORD *)((unsigned char *)(lockedRect.pBits) +
iHeight * lockedRect.Pitch + iWidth * 4);
// R<->B, [7:0] <-> [23:16], swizzle
color = ((color & 0xFF) << 16) | (color & 0xFF00) |
((color & 0xFF0000) >> 16) | (color & 0xFF000000);
memcpy(&(pPPMData[(iHeight * pDesc.Width + iWidth) * 4]),
(unsigned char *)&color, 4);
}
#else
memcpy(&(pPPMData[iHeight * pDesc.Width * 4]),
(unsigned char *)(lockedRect.pBits) + iHeight * lockedRect.Pitch,
pDesc.Width * 4);
#endif
}
pTargetSurface->UnlockRect();
// Prepends the PPM header info and bumps byte data afterwards
sdkSavePPM4ub(zFileName, pPPMData, pDesc.Width, pDesc.Height);
delete[] pPPMData;
pTargetSurface->Release();
pTargetTex->Release();
return S_OK;
}

View File

@ -1,54 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#ifndef _RENDERCHECK_D3D9_H_
#define _RENDERCHECK_D3D9_H_
#include <assert.h>
#include <d3d9.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
class CheckRenderD3D9 {
public:
CheckRenderD3D9() {}
static HRESULT BackbufferToPPM(IDirect3DDevice9 *pDevice,
const char *zFileName);
static HRESULT SurfaceToPPM(IDirect3DDevice9 *pDevice,
IDirect3DSurface9 *pSurface,
const char *zFileName);
static bool PPMvsPPM(const char *src_file, const char *ref_file,
const char *exec_path, const float epsilon,
const float threshold = 0.0f);
};
#endif

README.md
View File

@ -1,23 +1,20 @@
 # CUDA Samples
-Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads).
+Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.9](https://developer.nvidia.com/cuda-downloads).
 ## Release Notes
 This section describes the release notes for the CUDA Samples on GitHub only.
-### CUDA 11.6
+### Change Log
-* Added new folder structure for samples
-* Added support of Visual Studio 2022 to all samples supported on [Windows](#windows-1).
-* All CUDA samples are now only available on [GitHub](https://github.com/nvidia/cuda-samples). They are no longer available via CUDA toolkit.
-### [older versions...](./CHANGELOG.md)
+### [Revision History](./CHANGELOG.md)
 ## Getting Started
 ### Prerequisites
-Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html).
 ### Getting the CUDA Samples
@ -31,43 +28,278 @@ Without using git the easiest way to use these samples is to download the zip fi
 ## Building CUDA Samples
-### Windows
+### Building CUDA Samples
-The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
-```
-*_vs<version>.sln - for Visual Studio <version>
-```
-Complete samples solution files exist at parent directory of the repo:
-Each individual sample has its own set of solution files at:
-`<CUDA_SAMPLES_REPO>\Samples\<sample_dir>\`
-To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
+The CUDA Samples are built using CMake. Follow the instructions below for building on Linux, Windows, and for cross-compilation to Tegra devices.
 ### Linux
-The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
-```
-$ cd <sample_dir>
-$ make
-```
-The samples makefiles can take advantage of certain options:
-* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64.
-  By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
-  `$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/> `$ make TARGET_ARCH=aarch64` <br/>
-  See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details on cross platform compilation of cuda samples.
-* **dbg=1** - build with debug symbols
-  ```
-  $ make dbg=1
-  ```
-* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
-  ```
-  $ make SMS="50 60"
-  ```
-* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
-  ```
-  $ make HOST_COMPILER=g++
-  ```
+Ensure that CMake (version 3.20 or later) is installed. Install it using your package manager if necessary, e.g.
+```sudo apt install cmake```
Navigate to the root of the cloned repository and create a build directory:
```
mkdir build && cd build
```
Configure the project with CMake:
```
cmake ..
```
Build the samples:
```
make -j$(nproc)
```
Run the samples from their respective directories in the build folder. You can also follow this process from any subdirectory of the samples repo, or from within any individual sample.
### Windows
Language services for CMake are available in Visual Studio 2019 version 16.5 or later, and you can directly import the CUDA samples repository from either the root level or from any
subdirectory or individual sample.
To build from the command line, open the `x64 Native Tools Command Prompt for VS` provided with your Visual Studio installation.
Navigate to the root of the cloned repository and create a build directory:
```
mkdir build && cd build
```
Configure the project with CMake - for example:
```
cmake .. -G "Visual Studio 16 2019" -A x64
```
Open the generated solution file CUDA_Samples.sln in Visual Studio. Build the samples by selecting the desired configuration (e.g., Debug or Release) and pressing F7 (Build Solution).
Run the samples from the output directories specified in Visual Studio.
### Enabling On-GPU Debugging
NVIDIA GPUs support on-GPU debugging through cuda-gdb. Enabling this may significantly affect application performance as certain compiler optimizations are disabled
in this configuration, hence it's not on by default. Enablement of on-device debugging is controlled via the `-G` switch to nvcc.
To enable cuda-gdb for samples builds, define the `ENABLE_CUDA_DEBUG` flag on the CMake command line. For example:
```
cmake -DENABLE_CUDA_DEBUG=True ...
```
### Platform-Specific Samples
Some CUDA samples are specific to certain platforms, and require passing flags into CMake to enable. In particular, we define the following platform-specific flags:
* `BUILD_TEGRA` - for Tegra-specific samples
To build these samples, set the variables either on the command line or through your CMake GUI. For example:
```
cmake -DBUILD_TEGRA=True ..
```
### Cross-Compilation for Tegra Platforms
Install the NVIDIA toolchain and cross-compilation environment for Tegra devices as described in the Tegra Development Guide.
Ensure that CMake (version 3.20 or later) is installed.
Navigate to the root of the cloned repository and create a build directory:
```
mkdir build && cd build
```
Configure the project with CMake, specifying the Tegra toolchain file. You can use `-DTARGET_FS` to point to the target file system root path for the necessary include and library files:
```
cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/toolchain-aarch64-linux.cmake -DTARGET_FS=/path/to/target/system/file/system
```
Build the samples:
```
make -j$(nproc)
```
Transfer the built binaries to the Tegra device and execute them there.
### Building for Automotive Linux Platforms
These platforms require additional information to be passed to CMake on the command line to ensure proper resolution of all necessary include and library files.
Instead of using the default locations `/usr/local/cuda/include` and `/usr/local/cuda/lib64`, you must point to architecture-specific paths:
`/usr/local/cuda/<ARCH>/targets/aarch64-linux/lib`
and
`/usr/local/cuda/<ARCH>/include`
An example build might look like this:
```
$ mkdir build
$ cd build
$ cmake -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_LIBRARY_PATH=/usr/local/cuda/orin/lib64/ -DCMAKE_INCLUDE_PATH=/usr/local/cuda/orin/include -DBUILD_TEGRA=True ..
```
### QNX
Note that in the current branch, sample cross-compilation for QNX is not fully validated. This placeholder will be updated in the
near future with QNX cross-compilation instructions. In the meantime, if you want to cross-compile for QNX please check out one
of the previous tags prior to the CMake build system transition in 12.8.
## Running All Samples as Tests
It's important to note that the CUDA samples are _not_ intended as a validation suite for CUDA. They do not cover corner cases, do not completely cover the
runtime and driver APIs, and are not intended for performance benchmarking. That said, it can sometimes be useful to run all of the samples as a quick sanity check, and
we provide a script to do so: `run_tests.py`.
This Python3 script finds all executables in a subdirectory you choose, matching application names with command line arguments specified in `test_args.json`. It accepts
the following command line arguments:
| Switch | Purpose | Example |
| ---------- | -------------------------------------------------------------------------------------------------------------- | ----------------------- |
| --dir | Specify the root directory to search for executables (recursively) | --dir ./build/Samples |
| --config | JSON configuration file for executable arguments | --config test_args.json |
| --output | Output directory for test results (stdout saved to .txt files - directory will be created if it doesn't exist) | --output ./test |
| --args | Global arguments to pass to all executables (not currently used) | --args arg_1 arg_2 ... |
| --parallel | Number of applications to execute in parallel. | --parallel 8 |
Application configurations are loaded from `test_args.json` and matched against executable names (discarding the `.exe` extension on Windows).
The script returns 0 on success, or the first non-zero error code encountered during testing on failure. It will also print a condensed list of samples that failed, if any.
There are three primary modes of configuration:
**Skip**
An executable configured with "skip" will not be executed. These generally rely on having attached graphical displays and are not suited to this kind of automation.
Configuration example:
```json
"fluidsGL": {
"skip": true
}
```
You will see:
```
Skipping fluidsGL (marked as skip in config)
```
**Single Run**
For executables to run one time only with arguments, specify each argument as a list entry. Each entry in the JSON file will be appended to the command line, separated
by a space.
All applications execute from their current directory, so all paths are relative to the application's location.
Note that if an application needs no arguments, this entry is optional. An executable found without a matching entry in the JSON will just run as `./application` from its
current directory.
Configuration example:
```json
"ptxgen": {
"args": [
"test.ll",
"-arch=compute_75"
]
}
```
You will see:
```
Running ptxgen
Command: ./ptxgen test.ll -arch=compute_75
Test completed with return code 0
```
**Multiple Runs**
For executables to run multiple times with different command line arguments, specify any number of sets of args within a "runs" list.
As with single runs, all applications execute from their current directory, so all paths are relative to the application's location.
Configuration example:
```json
"recursiveGaussian": {
"runs": [
{
"args": [
"-sigma=10",
"-file=data/ref_10.ppm"
]
},
{
"args": [
"-sigma=14",
"-file=data/ref_14.ppm"
]
},
{
"args": [
"-sigma=18",
"-file=data/ref_18.ppm"
]
},
{
"args": [
"-sigma=22",
"-file=data/ref_22.ppm"
]
}
]
}
```
You will see:
```
Running recursiveGaussian (run 1/4)
Command: ./recursiveGaussian -sigma=10 -file=data/ref_10.ppm
Test completed with return code 0
Running recursiveGaussian (run 2/4)
Command: ./recursiveGaussian -sigma=14 -file=data/ref_14.ppm
Test completed with return code 0
Running recursiveGaussian (run 3/4)
Command: ./recursiveGaussian -sigma=18 -file=data/ref_18.ppm
Test completed with return code 0
Running recursiveGaussian (run 4/4)
Command: ./recursiveGaussian -sigma=22 -file=data/ref_22.ppm
Test completed with return code 0
```
### Example Usage
Here is an example set of commands to build and test all of the samples.
First, build:
```bash
mkdir build
cd build
cmake ..
make -j$(nproc)
```
Now, return to the samples root directory and run the test script:
```bash
cd ..
python3 run_tests.py --output ./test --dir ./build/Samples --config test_args.json
```
If all applications run successfully, you will see something similar to this (the specific number of samples will depend on your build type
and system configuration):
```
Test Summary:
Ran 199 test runs for 180 executables.
All test runs passed!
```
If some samples fail, you will see something like this:
```
Test Summary:
Ran 199 test runs for 180 executables.
Failed runs (2):
bicubicTexture (run 1/5): Failed (code 1)
Mandelbrot (run 1/2): Failed (code 1)
```
You can inspect the stdout logs in the output directory (generally `APM_<application_name>.txt` or `APM_<application_name>.run<n>.txt`) to help
determine what may have gone wrong. Please file issues against the samples repository if you believe a sample is failing
incorrectly on your system.
 ## Samples list
@ -92,6 +324,9 @@ Samples that are specific to domain (Graphics, Finance, Image Processing).
 ### [6. Performance](./Samples/6_Performance/README.md)
 Samples that demonstrate performance optimization.
+### [7. libNVVM](./Samples/7_libNVVM/README.md)
+Samples that demonstrate the use of libNVVM and NVVM IR.
 ## Dependencies
 Some CUDA Samples rely on third-party applications and/or libraries, or features provided by the CUDA Toolkit and Driver, to either build or execute. These dependencies are listed below.
@ -108,7 +343,7 @@ These third-party dependencies are required by some CUDA samples. If available,
 FreeImage is an open source imaging library. FreeImage can usually be installed on Linux using your distribution's package manager system. FreeImage can also be downloaded from the FreeImage website.
-To set up FreeImage on a Windows system, extract the FreeImage DLL distribution into the folder `../../../Common/FreeImage/Dist/x64` such that it contains the .h and .lib files. Copy the .dll file to root level `bin/win64/Debug` and `bin/win64/Release` folder.
+To set up FreeImage on a Windows system, extract the FreeImage DLL distribution into the folder `./Common/FreeImage/Dist/x64` such that it contains the .h and .lib files. Copy the .dll file to the Release/Debug execution folder, or pass the FreeImage paths when configuring with CMake via the `-DFreeImage_INCLUDE_DIR` and `-DFreeImage_LIBRARY` options.
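For example, a configure line might look like this (illustrative paths; adjust to where you extracted FreeImage):
```
cmake .. -DFreeImage_INCLUDE_DIR=/path/to/FreeImage/Dist/x64 -DFreeImage_LIBRARY=/path/to/FreeImage/Dist/x64/FreeImage.lib
```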
 #### Message Passing Interface
@ -138,9 +373,14 @@ OpenGL ES is an embedded systems graphics library used for 2D and 3D rendering.
 Vulkan is a low-overhead, cross-platform 3D graphics and compute API. Vulkan targets high-performance realtime 3D graphics applications such as video games and interactive media across all platforms. On systems which support Vulkan, NVIDIA's Vulkan implementation is provided with the CUDA Driver. For building and running Vulkan applications one needs to install the [Vulkan SDK](https://www.lunarg.com/vulkan-sdk/).
+#### GLFW
+GLFW is a lightweight, open-source library designed for managing OpenGL, OpenGL ES, and Vulkan contexts. It simplifies the process of creating and managing windows, handling user input (keyboard, mouse, and joystick), and working with multiple monitors in a cross-platform manner.
+To set up GLFW on a Windows system, download the pre-built binaries from the [GLFW website](https://www.glfw.org/download.html), extract the zip file, and pass the GLFW include folder to CMake as `-DGLFW_INCLUDE_DIR` and the lib folder as `-DGLFW_LIB_DIR`.
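For example (illustrative placeholder paths for wherever you extracted GLFW):
```
cmake .. -DGLFW_INCLUDE_DIR=C:/glfw/include -DGLFW_LIB_DIR=C:/glfw/lib-vc2022
```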
 #### OpenMP
-OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/).
+OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/). For compilers such as clang, `libomp.so` and other LLVM components must be installed separately. You will also need to set additional flags in your CMake configuration, such as: `-DOpenMP_CXX_FLAGS="-fopenmp=libomp" -DOpenMP_CXX_LIB_NAMES="omp" -DOpenMP_omp_LIBRARY="/path/to/libomp.so"`.
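Putting those flags together, an illustrative clang configure line (the library path is a placeholder):
```
cmake .. -DCMAKE_CXX_COMPILER=clang++ -DOpenMP_CXX_FLAGS="-fopenmp=libomp" -DOpenMP_CXX_LIB_NAMES="omp" -DOpenMP_omp_LIBRARY=/path/to/libomp.so
```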
 #### Screen
@ -246,6 +486,10 @@ FP16 is a 16-bit floating-point format. One bit is used for the sign, five bits
 NVCC support of [C++11 features](https://en.wikipedia.org/wiki/C++11).
+#### CMake
+The libNVVM samples are built using [CMake](https://cmake.org/) 3.10 or later.
 ## Contributors Guide
 We welcome your input on issues and suggestions for samples. At this time we are not accepting contributions from the public; check back here as we evolve our contribution model.
@ -263,4 +507,4 @@ Answers to frequently asked questions about CUDA can be found at http://develope
 ## Attributions
 * Teapot image is obtained from [Wikimedia](https://en.wikipedia.org/wiki/File:Original_Utah_Teapot.jpg) and is licensed under the Creative Commons [Attribution-Share Alike 2.0](https://creativecommons.org/licenses/by-sa/2.0/deed.en) Generic license. The image is modified for samples use cases.

View File

@ -0,0 +1,46 @@
add_subdirectory(UnifiedMemoryStreams)
add_subdirectory(asyncAPI)
add_subdirectory(clock)
add_subdirectory(clock_nvrtc)
add_subdirectory(cudaOpenMP)
add_subdirectory(fp16ScalarProduct)
add_subdirectory(matrixMul)
add_subdirectory(matrixMulDrv)
add_subdirectory(matrixMulDynlinkJIT)
add_subdirectory(matrixMul_nvrtc)
add_subdirectory(mergeSort)
add_subdirectory(simpleAWBarrier)
add_subdirectory(simpleAssert)
add_subdirectory(simpleAssert_nvrtc)
add_subdirectory(simpleAtomicIntrinsics)
add_subdirectory(simpleAtomicIntrinsics_nvrtc)
add_subdirectory(simpleAttributes)
add_subdirectory(simpleCUDA2GL)
add_subdirectory(simpleCallback)
add_subdirectory(simpleCooperativeGroups)
add_subdirectory(simpleCubemapTexture)
add_subdirectory(simpleDrvRuntime)
add_subdirectory(simpleHyperQ)
add_subdirectory(simpleIPC)
add_subdirectory(simpleLayeredTexture)
add_subdirectory(simpleMPI)
add_subdirectory(simpleMultiCopy)
add_subdirectory(simpleMultiGPU)
add_subdirectory(simpleOccupancy)
add_subdirectory(simpleP2P)
add_subdirectory(simplePitchLinearTexture)
add_subdirectory(simplePrintf)
add_subdirectory(simpleStreams)
add_subdirectory(simpleSurfaceWrite)
add_subdirectory(simpleTemplates)
add_subdirectory(simpleTexture)
add_subdirectory(simpleTexture3D)
add_subdirectory(simpleTextureDrv)
add_subdirectory(simpleVoteIntrinsics)
add_subdirectory(simpleZeroCopy)
add_subdirectory(template)
add_subdirectory(systemWideAtomics)
add_subdirectory(vectorAdd)
add_subdirectory(vectorAddDrv)
add_subdirectory(vectorAddMMAP)
add_subdirectory(vectorAdd_nvrtc)

View File

@ -4,24 +4,12 @@
 ### [asyncAPI](./asyncAPI)
 This sample illustrates the usage of CUDA events for both GPU timing and overlapping CPU and GPU execution. Events are inserted into a stream of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can perform computations while GPU is executing (including DMA memcopies between the host and device). CPU can query CUDA events to determine whether GPU has completed tasks.
-### [c++11_cuda](./c++11_cuda)
-This sample demonstrates C++11 feature support in CUDA. It scans a input text file and prints no. of occurrences of x, y, z, w characters.
 ### [clock](./clock)
 This example shows how to use the clock function to measure the performance of block of threads of a kernel accurately.
 ### [clock_nvrtc](./clock_nvrtc)
 This example shows how to use the clock function using libNVRTC to measure the performance of block of threads of a kernel accurately.
-### [concurrentKernels](./concurrentKernels)
-This sample demonstrates the use of CUDA streams for concurrent execution of several kernels on GPU device. It also illustrates how to introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function.
-### [cppIntegration](./cppIntegration)
-This example demonstrates how to integrate CUDA into an existing C++ application, i.e. the CUDA entry point on host side is only a function which is called from C++ code and only the file containing this function is compiled with nvcc. It also demonstrates that vector types can be used from cpp.
-### [cppOverload](./cppOverload)
-This sample demonstrates how to use C++ function overloading on the GPU.
 ### [cudaOpenMP](./cudaOpenMP)
 This sample demonstrates how to use OpenMP API to write an application for multiple GPUs.
@ -106,9 +94,6 @@ Use of Pitch Linear Textures
 ### [simplePrintf](./simplePrintf)
 This basic CUDA Runtime API sample demonstrates how to use the printf function in the device code.
-### [simpleSeparateCompilation](./simpleSeparateCompilation)
-This sample demonstrates a CUDA 5.0 feature, the ability to create a GPU device static library and use it within another CUDA kernel. This example demonstrates how to pass in a GPU device function (from the GPU device static library) as a function pointer to be called. This sample requires devices with compute capability 2.0 or higher.
 ### [simpleStreams](./simpleStreams)
 This sample uses CUDA streams to overlap kernel executions with memory copies between the host and a GPU device. This sample uses a new CUDA 4.0 feature that supports pinning of generic host memory. Requires Compute Capability 2.0 or higher.
@ -118,9 +103,6 @@ Simple example that demonstrates the use of 2D surface references (Write-to-Text
 ### [simpleTemplates](./simpleTemplates)
 This sample is a templatized version of the template project. It also shows how to correctly templatize dynamically allocated shared memory arrays.
-### [simpleTemplates_nvrtc](./simpleTemplates_nvrtc)
-This sample is a templatized version of the template project. It also shows how to correctly templatize dynamically allocated shared memory arrays.
 ### [simpleTexture](./simpleTexture)
 Simple example that demonstrates use of Textures in CUDA.
@ -133,9 +115,6 @@ Simple example that demonstrates use of Textures in CUDA. This sample uses the
 ### [simpleVoteIntrinsics](./simpleVoteIntrinsics)
 Simple program which demonstrates how to use the Vote (__any_sync, __all_sync) intrinsic instruction in a CUDA kernel.
-### [simpleVoteIntrinsics_nvrtc](./simpleVoteIntrinsics_nvrtc)
-Simple program which demonstrates how to use the Vote (any, all) intrinsic instruction in a CUDA kernel with runtime compilation using NVRTC APIs. Requires Compute Capability 2.0 or higher.
 ### [simpleZeroCopy](./simpleZeroCopy)
 This sample illustrates how to use Zero MemCopy, kernels can read and write directly to pinned system memory.
@ -159,4 +138,3 @@ This Vector Addition sample is a basic sample that is implemented element by ele
 ### [vectorAddMMAP](./vectorAddMMAP)
 This sample replaces the device allocation in the vectorAddDrv sample with cuMemMap-ed allocations. This sample demonstrates that the cuMemMap api allows the user to specify the physical properties of their memory while retaining the contiguous nature of their access, thus not requiring a change in their program structure.

View File

@ -0,0 +1,45 @@
cmake_minimum_required(VERSION 3.20)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
project(UnifiedMemoryStreams LANGUAGES C CXX CUDA)
find_package(CUDAToolkit REQUIRED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries
include_directories(../../../Common)
# Source file
if(CMAKE_GENERATOR MATCHES "Visual Studio")
find_package(OpenMP REQUIRED C CXX)
else()
find_package(OpenMP REQUIRED)
endif()
if(${OpenMP_FOUND})
# Add target for UnifiedMemoryStreams
add_executable(UnifiedMemoryStreams UnifiedMemoryStreams.cu)
target_compile_options(UnifiedMemoryStreams PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_features(UnifiedMemoryStreams PRIVATE cxx_std_17 cuda_std_17)
set_target_properties(UnifiedMemoryStreams PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_link_libraries(UnifiedMemoryStreams PUBLIC
CUDA::cublas
OpenMP::OpenMP_CXX
)
else()
message(STATUS "OpenMP not found - will not build sample 'UnifiedMemoryStreams'")
endif()

View File

@ -1,381 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux platforms
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
SAMPLE_ENABLED := 1
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - UnifiedMemoryStreams is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Attempt to compile a minimal OpenMP application. If a.out exists, OpenMP is properly set up.
ifneq (,$(filter $(TARGET_OS),linux android))
ifneq (,$(filter $(TARGET_OS), android))
LIBRARIES += -lomp
else
LIBRARIES += -lgomp
endif
ALL_CCFLAGS += -Xcompiler -fopenmp
$(shell echo "#include <omp.h>" > test.c ; echo "int main() { omp_get_num_threads(); return 0; }" >> test.c ; $(HOST_COMPILER) -fopenmp test.c)
OPENMP ?= $(shell find a.out 2>/dev/null)
ifeq ($(OPENMP),)
$(info -----------------------------------------------------------------------------------------------)
$(info WARNING - OpenMP is unable to compile)
$(info -----------------------------------------------------------------------------------------------)
$(info This CUDA Sample cannot be built if the OpenMP compiler is not set up correctly.)
$(info This will be a dry-run of the Makefile.)
$(info For more information on how to set up your environment to build and run this )
$(info sample, please refer the CUDA Samples documentation and release notes)
$(info -----------------------------------------------------------------------------------------------)
SAMPLE_ENABLED := 0
endif
$(shell rm a.out test.c 2>/dev/null)
else
LIBRARIES += -lpthread
ALL_CCFLAGS += -DUSE_PTHREADS
endif
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
ALL_CCFLAGS += --threads 0 --std=c++11
LIBRARIES += -lcublas
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: UnifiedMemoryStreams
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
UnifiedMemoryStreams.o:UnifiedMemoryStreams.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
UnifiedMemoryStreams: UnifiedMemoryStreams.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./UnifiedMemoryStreams
testrun: build
clean:
rm -f UnifiedMemoryStreams UnifiedMemoryStreams.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/UnifiedMemoryStreams
clobber: clean

View File

@@ -1,101 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>UnifiedMemoryStreams</name>
<cuda_api_list>
<toolkit>cudaStreamDestroy</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaMallocManaged</toolkit>
<toolkit>cudaStreamCreate</toolkit>
<toolkit>cudaDeviceSynchronize</toolkit>
<toolkit>cudaStreamAttachMemAsync</toolkit>
<toolkit>cudaSetDevice</toolkit>
<toolkit>cudaStreamSynchronize</toolkit>
<toolkit>cudaGetDeviceProperties</toolkit>
</cuda_api_list>
<description><![CDATA[This sample demonstrates the use of OpenMP and streams with Unified Memory on a single GPU.]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../../Common</path>
</includepaths>
<keyconcepts>
<concept level="basic">CUDA Systems Integration</concept>
<concept level="basic">OpenMP</concept>
<concept level="basic">CUBLAS</concept>
<concept level="basic">Multithreading</concept>
<concept level="basic">Unified Memory</concept>
<concept level="basic">CUDA Streams and Events</concept>
</keyconcepts>
<keywords>
<keyword>CUDA</keyword>
<keyword>CUBLAS</keyword>
<keyword>OpenMP</keyword>
<keyword>cluster</keyword>
<keyword>multi-GPU Support</keyword>
<keyword>Unified Memory</keyword>
<keyword>UVM</keyword>
<keyword>openMP</keyword>
<keyword>Streams</keyword>
<keyword>pthreads</keyword>
</keywords>
<libraries>
<library>cublas</library>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>UnifiedMemoryStreams.cu</primary_file>
<required_dependencies>
<dependency>OpenMP</dependency>
<dependency>UVM</dependency>
<dependency>CUBLAS</dependency>
</required_dependencies>
<scopes>
<scope>1:CUDA Basic Topics</scope>
<scope>1:CUDA Systems Integration</scope>
<scope>1:Unified Memory</scope>
</scopes>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm53</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<sm-arch>sm80</sm-arch>
<sm-arch>sm86</sm-arch>
<sm-arch>sm87</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>sbsa</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<from>3.5</from>
</supported_sm_architectures>
<title>Unified Memory Streams</title>
<type>exe</type>
</entry>

View File

@@ -10,65 +10,25 @@ CUDA Systems Integration, OpenMP, CUBLAS, Multithreading, Unified Memory, CUDA Streams and Events
## Supported SM Architectures
[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux, Windows
## Supported CPU Architecture
x86_64, armv7l
## CUDA APIs involved
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaStreamDestroy, cudaFree, cudaMallocManaged, cudaStreamAttachMemAsync, cudaSetDevice, cudaDeviceSynchronize, cudaStreamSynchronize, cudaStreamCreate, cudaGetDeviceProperties
## Dependencies needed to build/run
[OpenMP](../../../README.md#openmp), [UVM](../../../README.md#uvm), [CUBLAS](../../../README.md#cublas)
## Prerequisites
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.
## Build and Run
### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory:
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
## References (for more details)

View File

@@ -31,10 +31,10 @@
*/
// system includes
#include <algorithm>
#include <cstdio>
#include <ctime>
#include <vector>
#ifdef USE_PTHREADS
#include <pthread.h>
#else
@@ -51,291 +51,287 @@
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// SRAND48 and DRAND48 don't exist on windows, but these are the equivalent
// functions
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif

const char *sSDKname = "UnifiedMemoryStreams";

// simple task
template <typename T> struct Task
{
    unsigned int size, id;
    T *data;
    T *result;
    T *vector;

    Task()
        : size(0)
        , id(0)
        , data(NULL)
        , result(NULL)
        , vector(NULL) {};
    Task(unsigned int s)
        : size(s)
        , id(0)
        , data(NULL)
        , result(NULL)
    {
        // allocate unified memory -- the operation performed in this example will
        // be a DGEMV
        checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
        checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
        checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
        checkCudaErrors(cudaDeviceSynchronize());
    }

    ~Task()
    {
        // ensure all memory is deallocated
        checkCudaErrors(cudaDeviceSynchronize());
        checkCudaErrors(cudaFree(data));
        checkCudaErrors(cudaFree(result));
        checkCudaErrors(cudaFree(vector));
    }

    void allocate(const unsigned int s, const unsigned int unique_id)
    {
        // allocate unified memory outside of constructor
        id   = unique_id;
        size = s;
        checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
        checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
        checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
        checkCudaErrors(cudaDeviceSynchronize());

        // populate data with random elements
        for (unsigned int i = 0; i < size * size; i++) {
            data[i] = drand48();
        }
        for (unsigned int i = 0; i < size; i++) {
            result[i] = 0.;
            vector[i] = drand48();
        }
    }
};

#ifdef USE_PTHREADS
struct threadData_t
{
    int tid;
    Task<double> *TaskListPtr;
    cudaStream_t *streams;
    cublasHandle_t *handles;
    int taskSize;
};

typedef struct threadData_t threadData;
#endif

// simple host dgemv: assume data is in row-major format and square
template <typename T> void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result)
{
    // rows
    for (int i = 0; i < n; i++) {
        result[i] *= beta;
        for (int j = 0; j < n; j++) {
            result[i] += A[i * n + j] * x[j];
        }
    }
}

// execute a single task on either host or device depending on size
#ifdef USE_PTHREADS
void *execute(void *inpArgs)
{
    threadData *dataPtr    = (threadData *)inpArgs;
    cudaStream_t *stream   = dataPtr->streams;
    cublasHandle_t *handle = dataPtr->handles;
    int tid                = dataPtr->tid;

    for (int i = 0; i < dataPtr->taskSize; i++) {
        Task<double> &t = dataPtr->TaskListPtr[i];
        if (t.size < 100) {
            // perform on host
            printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
            // attach managed memory to a (dummy) stream to allow host access while
            // the device is running
            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
            // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
            checkCudaErrors(cudaStreamSynchronize(stream[0]));
            // call the host operation
            gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
        }
        else {
            // perform on device
            printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
            double one  = 1.0;
            double zero = 0.0;

            // attach managed memory to my stream
            checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
            // call the device operation
            checkCudaErrors(cublasDgemv(
                handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
        }
    }
    pthread_exit(NULL);
}
#else
template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
    if (t.size < 100) {
        // perform on host
        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
        // attach managed memory to a (dummy) stream to allow host access while the
        // device is running
        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
        // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
        checkCudaErrors(cudaStreamSynchronize(stream[0]));
        // call the host operation
        gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
    }
    else {
        // perform on device
        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
        double one  = 1.0;
        double zero = 0.0;

        // attach managed memory to my stream
        checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
        // call the device operation
        checkCudaErrors(cublasDgemv(
            handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
    }
}
#endif

// populate a list of tasks with random sizes
template <typename T> void initialise_tasks(std::vector<Task<T>> &TaskList)
{
    for (unsigned int i = 0; i < TaskList.size(); i++) {
        // generate random size
        int size;
        size = std::max((int)(drand48() * 1000.0), 64);
        TaskList[i].allocate(size, i);
    }
}

int main(int argc, char **argv)
{
    // set device
    cudaDeviceProp device_prop;
    int dev_id = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));

    if (!device_prop.managedMemory) {
        // This samples requires being run on a device that supports Unified Memory
        fprintf(stderr, "Unified Memory not supported on this device\n");
        exit(EXIT_WAIVED);
    }

    if (device_prop.computeMode == cudaComputeModeProhibited) {
        // This sample requires being run with a default or process exclusive mode
        fprintf(stderr,
                "This sample requires a device in either default or process "
                "exclusive mode\n");
        exit(EXIT_WAIVED);
    }

    // randomise task sizes
    int seed = (int)time(NULL);
    srand48(seed);

    // set number of threads
    const int nthreads = 4;

    // number of streams = number of threads
    cudaStream_t *streams   = new cudaStream_t[nthreads + 1];
    cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];

    for (int i = 0; i < nthreads + 1; i++) {
        checkCudaErrors(cudaStreamCreate(&streams[i]));
        checkCudaErrors(cublasCreate(&handles[i]));
    }

    // create list of N tasks
    unsigned int N = 40;
    std::vector<Task<double>> TaskList(N);
    initialise_tasks(TaskList);

    printf("Executing tasks on host / device\n");

// run through all tasks using threads and streams
#ifdef USE_PTHREADS
    pthread_t threads[nthreads];
    threadData *InputToThreads = new threadData[nthreads];

    for (int i = 0; i < nthreads; i++) {
        checkCudaErrors(cudaSetDevice(dev_id));
        InputToThreads[i].tid     = i;
        InputToThreads[i].streams = streams;
        InputToThreads[i].handles = handles;

        if ((TaskList.size() / nthreads) == 0) {
            InputToThreads[i].taskSize    = (TaskList.size() / nthreads);
            InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
        }
        else {
            if (i == nthreads - 1) {
                InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
                InputToThreads[i].TaskListPtr =
                    &TaskList[i * (TaskList.size() / nthreads) + (TaskList.size() % nthreads)];
            }
            else {
                InputToThreads[i].taskSize    = (TaskList.size() / nthreads);
                InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
            }
        }
        pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
    }
    for (int i = 0; i < nthreads; i++) {
        pthread_join(threads[i], NULL);
    }
#else
    omp_set_num_threads(nthreads);
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < TaskList.size(); i++) {
        checkCudaErrors(cudaSetDevice(dev_id));
        int tid = omp_get_thread_num();
        execute(TaskList[i], handles, streams, tid);
    }
#endif
    cudaDeviceSynchronize();

    // Destroy CUDA Streams, cuBlas handles
    for (int i = 0; i < nthreads + 1; i++) {
        cudaStreamDestroy(streams[i]);
        cublasDestroy(handles[i]);
    }

    // Free TaskList
    std::vector<Task<double>>().swap(TaskList);

    printf("All Done!\n");
    exit(EXIT_SUCCESS);
}

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryStreams", "UnifiedMemoryStreams_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,113 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>UnifiedMemoryStreams_vs2017</RootNamespace>
<ProjectName>UnifiedMemoryStreams</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/UnifiedMemoryStreams.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
<AdditionalCompilerOptions>/openmp</AdditionalCompilerOptions>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="UnifiedMemoryStreams.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryStreams", "UnifiedMemoryStreams_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,109 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>UnifiedMemoryStreams_vs2019</RootNamespace>
<ProjectName>UnifiedMemoryStreams</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/UnifiedMemoryStreams.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
<AdditionalCompilerOptions>/openmp</AdditionalCompilerOptions>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="UnifiedMemoryStreams.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UnifiedMemoryStreams", "UnifiedMemoryStreams_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,109 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>UnifiedMemoryStreams_vs2022</RootNamespace>
<ProjectName>UnifiedMemoryStreams</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cublas.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/UnifiedMemoryStreams.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
<AdditionalCompilerOptions>/openmp</AdditionalCompilerOptions>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="UnifiedMemoryStreams.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -0,0 +1,30 @@
cmake_minimum_required(VERSION 3.20)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
project(asyncAPI LANGUAGES C CXX CUDA)
find_package(CUDAToolkit REQUIRED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries
include_directories(../../../Common)
# Source file
# Add target for asyncAPI
add_executable(asyncAPI asyncAPI.cu)
target_compile_options(asyncAPI PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_features(asyncAPI PRIVATE cxx_std_17 cuda_std_17)
set_target_properties(asyncAPI PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

View File

@@ -1,341 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux platforms
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
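# Illustration (not part of the original Makefile): SMS="50 60" expands
# GENCODE_FLAGS to
#   -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60
#   -gencode arch=compute_60,code=compute_60
# where the last entry embeds PTX so newer GPUs can JIT-compile the code.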
ALL_CCFLAGS += --threads 0 --std=c++11
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: asyncAPI
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
asyncAPI.o:asyncAPI.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
asyncAPI: asyncAPI.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./asyncAPI
testrun: build
$(EXEC) ./asyncAPI --dummy-test-param
clean:
rm -f asyncAPI asyncAPI.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/asyncAPI
clobber: clean

View File

@@ -1,90 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>asyncAPI</name>
<cuda_api_list>
<toolkit>cudaMemset</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaEventRecord</toolkit>
<toolkit>cudaMallocHost</toolkit>
<toolkit>cudaProfilerStart</toolkit>
<toolkit>cudaEventCreate</toolkit>
<toolkit>cudaEventElapsedTime</toolkit>
<toolkit>cudaDeviceSynchronize</toolkit>
<toolkit>cudaFreeHost</toolkit>
<toolkit>cudaMalloc</toolkit>
<toolkit>cudaEventQuery</toolkit>
<toolkit>cudaProfilerStop</toolkit>
<toolkit>cudaEventDestroy</toolkit>
<toolkit>cudaMemcpyAsync</toolkit>
<toolkit>cudaGetDeviceProperties</toolkit>
</cuda_api_list>
<description><![CDATA[This sample illustrates the usage of CUDA events for both GPU timing and overlapping CPU and GPU execution. Events are inserted into a stream of CUDA calls. Since CUDA stream calls are asynchronous, the CPU can perform computations while the GPU is executing (including DMA memcopies between the host and device). The CPU can query CUDA events to determine whether the GPU has completed tasks (a minimal sketch of the pattern follows this entry).]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../../Common</path>
</includepaths>
<keyconcepts>
<concept level="basic">Asynchronous Data Transfers</concept>
<concept level="basic">CUDA Streams and Events</concept>
</keyconcepts>
<keywords>
<keyword>GPGPU</keyword>
</keywords>
<libraries>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>asyncAPI.cu</primary_file>
<qatests>
<qatest>--dummy-test-param</qatest>
</qatests>
<scopes>
<scope>1:CUDA Basic Topics</scope>
<scope>1:Performance Strategies</scope>
</scopes>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm53</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<sm-arch>sm80</sm-arch>
<sm-arch>sm86</sm-arch>
<sm-arch>sm87</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>sbsa</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>asyncAPI</title>
<type>exe</type>
</entry>
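The description above is the heart of the sample. Distilled, the event pattern looks like the following minimal sketch (illustrative only: the `work` kernel is a stand-in and error checking is omitted; the real sample adds pinned host memory and async copies around the kernel):

```
#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for any asynchronous GPU work.
__global__ void work(int *p) { p[threadIdx.x] += 1; }

int main() {
    int *d = nullptr;
    cudaMalloc(&d, 256 * sizeof(int));
    cudaMemset(d, 0, 256 * sizeof(int));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0); // marker enqueued into stream 0
    work<<<1, 256>>>(d);       // launch returns immediately
    cudaEventRecord(stop, 0);  // marker enqueued after the kernel

    unsigned long iters = 0;   // CPU stays busy while the GPU runs
    while (cudaEventQuery(stop) == cudaErrorNotReady)
        ++iters;

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop); // GPU time between the markers
    std::printf("GPU: %.3f ms, CPU poll iterations: %lu\n", ms, iters);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(d);
    return 0;
}
```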

View File

@@ -10,7 +10,7 @@ Asynchronous Data Transfers, CUDA Streams and Events
 ## Supported SM Architectures
-[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)
+[SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)
 ## Supported OSes
@@ -18,53 +18,15 @@ Linux, Windows
 ## Supported CPU Architecture
-x86_64, ppc64le, armv7l
+x86_64, armv7l
 ## CUDA APIs involved
 ### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
-cudaMemset, cudaFree, cudaEventRecord, cudaMallocHost, cudaProfilerStart, cudaEventCreate, cudaEventElapsedTime, cudaDeviceSynchronize, cudaFreeHost, cudaMalloc, cudaEventQuery, cudaProfilerStop, cudaEventDestroy, cudaMemcpyAsync, cudaGetDeviceProperties
+cudaProfilerStop, cudaMalloc, cudaMemcpyAsync, cudaFree, cudaMallocHost, cudaProfilerStart, cudaDeviceSynchronize, cudaEventRecord, cudaFreeHost, cudaMemset, cudaEventDestroy, cudaEventQuery, cudaEventElapsedTime, cudaGetDeviceProperties, cudaEventCreate
 ## Prerequisites
-Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
-## Build and Run
-### Windows
-The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
-```
-*_vs<version>.sln - for Visual Studio <version>
-```
-Each individual sample has its own set of solution files in its directory:
-To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
-> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details.
-### Linux
-The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
-```
-$ cd <sample_dir>
-$ make
-```
-The samples makefiles can take advantage of certain options:
-* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
-By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
-`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
-See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
-* **dbg=1** - build with debug symbols
-```
-$ make dbg=1
-```
-* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
-```
-$ make SMS="50 60"
-```
-* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
-```
-$ make HOST_COMPILER=g++
-```
 ## References (for more details)

View File

@@ -38,105 +38,107 @@
(This hunk is a formatting-only change to asyncAPI.cu: the CUDA includes are reordered alphabetically and the brace/indent style changes from two-space K&R to four-space Allman; the logic is unchanged. The resulting file:)
#include <stdio.h>
// includes CUDA Runtime
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper utility functions

__global__ void increment_kernel(int *g_data, int inc_value)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_data[idx] = g_data[idx] + inc_value;
}

bool correct_output(int *data, const int n, const int x)
{
    for (int i = 0; i < n; i++)
        if (data[i] != x) {
            printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
            return false;
        }
    return true;
}

int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;
    printf("[%s] - Starting...\n", argv[0]);
    // This will pick the best possible CUDA capable device
    devID = findCudaDevice(argc, (const char **)argv);
    // get device name
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s]\n", deviceProps.name);
    int n = 16 * 1024 * 1024;
    int nbytes = n * sizeof(int);
    int value = 26;
    // allocate host memory
    int *a = 0;
    checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
    memset(a, 0, nbytes);
    // allocate device memory
    int *d_a = 0;
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    checkCudaErrors(cudaMemset(d_a, 255, nbytes));
    // set kernel launch configuration
    dim3 threads = dim3(512, 1);
    dim3 blocks = dim3(n / threads.x, 1);
    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkResetTimer(&timer);
    checkCudaErrors(cudaDeviceSynchronize());
    float gpu_time = 0.0f;
    // asynchronously issue work to the GPU (all to stream 0)
    checkCudaErrors(cudaProfilerStart());
    sdkStartTimer(&timer);
    cudaEventRecord(start, 0);
    cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
    cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
    cudaEventRecord(stop, 0);
    sdkStopTimer(&timer);
    checkCudaErrors(cudaProfilerStop());
    // have CPU do some work while waiting for stage 1 to finish
    unsigned long int counter = 0;
    while (cudaEventQuery(stop) == cudaErrorNotReady) {
        counter++;
    }
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
    // print the cpu and gpu times
    printf("time spent executing by the GPU: %.2f\n", gpu_time);
    printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
    // check the output for correctness
    bool bFinalResults = correct_output(a, n, value);
    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFreeHost(a));
    checkCudaErrors(cudaFree(d_a));
    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
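Everything in the timed section goes to the default stream (the trailing `0` arguments). The same sequence carries over unchanged to an explicit stream, the usual next step when overlapping several independent queues. A sketch of that variant, reusing the buffers, events, and kernel declared in the listing above (not part of the sample itself):

```
// Hypothetical variant: issue the same sequence into a user-created stream.
cudaStream_t stream;
checkCudaErrors(cudaStreamCreate(&stream));
checkCudaErrors(cudaEventRecord(start, stream));
checkCudaErrors(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, stream));
increment_kernel<<<blocks, threads, 0, stream>>>(d_a, value);
checkCudaErrors(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaEventRecord(stop, stream));
// Block instead of polling when the CPU has no useful work to overlap.
checkCudaErrors(cudaEventSynchronize(stop));
checkCudaErrors(cudaStreamDestroy(stream));
```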

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "asyncAPI", "asyncAPI_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,112 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>asyncAPI_vs2017</RootNamespace>
<ProjectName>asyncAPI</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/asyncAPI.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="asyncAPI.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "asyncAPI", "asyncAPI_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>asyncAPI_vs2019</RootNamespace>
<ProjectName>asyncAPI</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/asyncAPI.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="asyncAPI.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "asyncAPI", "asyncAPI_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>asyncAPI_vs2022</RootNamespace>
<ProjectName>asyncAPI</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/asyncAPI.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="asyncAPI.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,372 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux platforms
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
# This sample is not supported on QNX
ifeq ($(TARGET_OS),qnx)
$(info >>> WARNING - c++11_cuda is not supported on QNX - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Detect if the installed version of GCC supports the required C++11
ifeq ($(TARGET_OS),linux)
empty :=
space := $(empty) $(empty)
GCCVERSIONSTRING := $(shell expr `$(HOST_COMPILER) -dumpversion`)
# Create version number without "."
GCCVERSION := $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f1 -d.)
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f2 -d.)
GCCVERSION += $(shell expr `echo $(GCCVERSIONSTRING)` | cut -f3 -d.)
# Make sure the version number has at least 3 decimals
GCCVERSION += 00
# Remove spaces from the version number
GCCVERSION := $(subst $(space),$(empty),$(GCCVERSION))
#$(warning $(GCCVERSION))
IS_MIN_VERSION := $(shell expr `echo $(GCCVERSION)` \>= 47000)
ifeq ($(IS_MIN_VERSION), 1)
$(info >>> GCC Version is greater than or equal to 4.7.0 <<<)
else
$(info >>> Waiving build. Minimum GCC version required is 4.7.0 <<<)
SAMPLE_ENABLED := 0
endif
endif
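# Illustration (not part of the original Makefile): a HOST_COMPILER whose
# -dumpversion prints "9.4.0" yields GCCVERSION = "9 4 0 00" -> "94000",
# which passes the 47000 (i.e. GCC 4.7.0) minimum-version check above.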
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
ALL_CCFLAGS += --std=c++11 --threads 0
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: c++11_cuda
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
c++11_cuda.o:c++11_cuda.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
c++11_cuda: c++11_cuda.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./c++11_cuda
testrun: build
$(EXEC) ./c++11_cuda --dummy-test-param
clean:
rm -f c++11_cuda c++11_cuda.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/c++11_cuda
clobber: clean

View File

@@ -1,85 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>c++11_cuda</name>
<cflags>
<flag>--std=c++11</flag>
</cflags>
<cuda_api_list>
<toolkit>cudaMalloc</toolkit>
<toolkit>cudaMemset</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaMemcpy</toolkit>
</cuda_api_list>
<description><![CDATA[This sample demonstrates C++11 feature support in CUDA. It scans an input text file and prints the number of occurrences of the characters x, y, z, and w (a minimal sketch of the idea follows this entry).]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../../Common</path>
</includepaths>
<keyconcepts>
<concept level="advanced">CPP11 CUDA</concept>
</keyconcepts>
<keywords>
<keyword>GPGPU</keyword>
<keyword>CPP11</keyword>
</keywords>
<libraries>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>c++11_cuda.cu</primary_file>
<qatests>
<qatest>--dummy-test-param</qatest>
</qatests>
<required_dependencies>
<dependency>CPP11</dependency>
</required_dependencies>
<scopes>
<scope>1:CUDA Advanced Topics</scope>
<scope>1:C++11 CUDA</scope>
</scopes>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm53</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<sm-arch>sm80</sm-arch>
<sm-arch>sm86</sm-arch>
<sm-arch>sm87</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>sbsa</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
<env>
<platform>windows7</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>C++11 CUDA</title>
<type>exe</type>
</entry>
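Distilled from the description above, the C++11 device-lambda idea fits in a few lines. A self-contained sketch (not the sample itself, which does its counting inside kernels; this version dispatches the same lambda through host-side Thrust and assumes nvcc's `--extended-lambda` flag):

```
#include <cstdio>
#include <thrust/count.h>
#include <thrust/device_vector.h>

// Count occurrences of x, y, z, w with a C++11 device lambda.
// Build sketch: nvcc -std=c++11 --extended-lambda count_sketch.cu
int main() {
    const char text[] = "wxyz appears here; so do w, x, y and z";
    thrust::device_vector<char> d_text(text, text + sizeof(text) - 1);
    int count = (int)thrust::count_if(
        d_text.begin(), d_text.end(), [] __device__(char c) {
            return c == 'x' || c == 'y' || c == 'z' || c == 'w';
        });
    std::printf("found %d of {x, y, z, w}\n", count);
    return 0;
}
```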

View File

@@ -1,74 +0,0 @@
# c++11_cuda - C++11 CUDA
## Description
This sample demonstrates C++11 feature support in CUDA. It scans an input text file and prints the number of occurrences of the characters x, y, z, and w.
## Key Concepts
CPP11 CUDA
## Supported SM Architectures
[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux, Windows
## Supported CPU Architecture
x86_64, ppc64le, armv7l
## CUDA APIs involved
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaMalloc, cudaMemset, cudaFree, cudaMemcpy
## Dependencies needed to build/run
[CPP11](../../README.md#cpp11)
## Prerequisites
Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed.
## Build and Run
### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory.
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The sample makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is equivalent to setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
## References (for more details)

View File

@@ -1,140 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <thrust/device_ptr.h>
#include <thrust/count.h>
#include <thrust/execution_policy.h>
#include <iostream>
#include <helper_cuda.h>
/////////////////////////////////////////////////////////////////
// Some utility code to define grid_stride_range
// Normally this would be in a header but it's here
// for didactic purposes. Uses range.hpp:
#include "range.hpp"
using namespace util::lang;
// type alias to simplify typing...
template <typename T>
using step_range = typename range_proxy<T>::step_range_proxy;
template <typename T>
__device__ step_range<T> grid_stride_range(T begin, T end) {
begin += blockDim.x * blockIdx.x + threadIdx.x;
return range(begin, end).step(gridDim.x * blockDim.x);
}
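// Editor's note (illustrative, not part of the original sample): a typical
// grid-stride kernel written with this helper would look like
//   __global__ void scale(float *v, int n, float s) {
//     for (auto i : grid_stride_range(0, n)) v[i] *= s;
//   }
// Each thread starts at its global thread index and strides by the total
// number of threads in the grid, so any launch configuration covers all n
// elements.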
/////////////////////////////////////////////////////////////////
template <typename T, typename Predicate>
__device__ void count_if(int *count, T *data, int n, Predicate p) {
for (auto i : grid_stride_range(0, n)) {
if (p(data[i])) atomicAdd(count, 1);
}
}
// Use count_if with a lambda function that searches for x, y, z or w
// Note the use of range-based for loop and initializer_list inside the functor
// We use auto so we don't have to know the type of the functor or array
__global__ void xyzw_frequency(int *count, char *text, int n) {
const char letters[]{'x', 'y', 'z', 'w'};
count_if(count, text, n, [&](char c) {
for (const auto x : letters)
if (c == x) return true;
return false;
});
}
__global__ void xyzw_frequency_thrust_device(int *count, char *text, int n) {
const char letters[]{'x', 'y', 'z', 'w'};
*count = thrust::count_if(thrust::device, text, text + n, [=](char c) {
for (const auto x : letters)
if (c == x) return true;
return false;
});
}
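// Editor's note: the device-side lambda passed to thrust::count_if above
// requires compiling with nvcc's --extended-lambda flag (spelled
// --expt-extended-lambda on older toolkits).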
// a bug in Thrust 1.8 causes warnings when this is uncommented
// so commented out by default -- fixed in Thrust master branch
#if 0
void xyzw_frequency_thrust_host(int *count, char *text, int n)
{
const char letters[] { 'x','y','z','w' };
*count = thrust::count_if(thrust::host, text, text+n, [&](char c) {
for (const auto x : letters)
if (c == x) return true;
return false;
});
}
#endif
int main(int argc, char **argv) {
const char *filename = sdkFindFilePath("warandpeace.txt", argv[0]);
int numBytes = 16 * 1048576;
char *h_text = (char *)malloc(numBytes);
// find first CUDA device
int devID = findCudaDevice(argc, (const char **)argv);
char *d_text;
checkCudaErrors(cudaMalloc((void **)&d_text, numBytes));
FILE *fp = fopen(filename, "r");
if (fp == NULL) {
printf("Cannot find the input text file\n. Exiting..\n");
return EXIT_FAILURE;
}
int len = (int)fread(h_text, sizeof(char), numBytes, fp);
fclose(fp);
std::cout << "Read " << len << " byte corpus from " << filename << std::endl;
checkCudaErrors(cudaMemcpy(d_text, h_text, len, cudaMemcpyHostToDevice));
int count = 0;
int *d_count;
checkCudaErrors(cudaMalloc(&d_count, sizeof(int)));
checkCudaErrors(cudaMemset(d_count, 0, sizeof(int)));
// Try uncommenting one kernel call at a time
xyzw_frequency<<<8, 256>>>(d_count, d_text, len);
xyzw_frequency_thrust_device<<<1, 1>>>(d_count, d_text, len);
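// Editor's note: because both kernels run, the Thrust kernel's plain write
// to *count overwrites the atomicAdd total accumulated by xyzw_frequency;
// the two totals should agree, and the value printed below comes from the
// Thrust version.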
checkCudaErrors(
cudaMemcpy(&count, d_count, sizeof(int), cudaMemcpyDeviceToHost));
// xyzw_frequency_thrust_host(&count, h_text, len);
std::cout << "counted " << count
<< " instances of 'x', 'y', 'z', or 'w' in \"" << filename << "\""
<< std::endl;
checkCudaErrors(cudaFree(d_count));
checkCudaErrors(cudaFree(d_text));
return EXIT_SUCCESS;
}


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "c++11_cuda", "c++11_cuda_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,112 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>c++11_cuda_vs2017</RootNamespace>
<ProjectName>c++11_cuda</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/c++11_cuda.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
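<!-- Editor's note: each compute_XY,sm_XY pair above embeds SASS for that
     specific GPU architecture; adding a compute_XY,compute_XY entry would
     also embed forward-compatible PTX. -->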
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="c++11_cuda.cu" />
<ClInclude Include="range.hpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "c++11_cuda", "c++11_cuda_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>c++11_cuda_vs2019</RootNamespace>
<ProjectName>c++11_cuda</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/c++11_cuda.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="c++11_cuda.cu" />
<ClInclude Include="range.hpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "c++11_cuda", "c++11_cuda_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>c++11_cuda_vs2022</RootNamespace>
<ProjectName>c++11_cuda</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/c++11_cuda.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="c++11_cuda.cu" />
<ClInclude Include="range.hpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>


@ -1,279 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef UTIL_LANG_RANGE_HPP
#define UTIL_LANG_RANGE_HPP
#include <iterator>
#include <type_traits>
// Make these ranges usable inside CUDA C++ device code
#ifdef __CUDACC__
#define DEVICE_CALLABLE __host__ __device__
#else
#define DEVICE_CALLABLE
#endif
namespace util {
namespace lang {
namespace detail {
template <typename T>
struct range_iter_base : std::iterator<std::input_iterator_tag, T> {
DEVICE_CALLABLE
range_iter_base(T current) : current(current) {}
DEVICE_CALLABLE
T operator*() const { return current; }
DEVICE_CALLABLE
T const* operator->() const { return &current; }
DEVICE_CALLABLE
range_iter_base& operator++() {
++current;
return *this;
}
DEVICE_CALLABLE
range_iter_base operator++(int) {
auto copy = *this;
++*this;
return copy;
}
DEVICE_CALLABLE
bool operator==(range_iter_base const& other) const {
return current == other.current;
}
DEVICE_CALLABLE
bool operator!=(range_iter_base const& other) const {
return not(*this == other);
}
protected:
T current;
};
} // namespace detail
template <typename T>
struct range_proxy {
struct iter : detail::range_iter_base<T> {
DEVICE_CALLABLE
iter(T current) : detail::range_iter_base<T>(current) {}
};
struct step_range_proxy {
struct iter : detail::range_iter_base<T> {
DEVICE_CALLABLE
iter(T current, T step)
: detail::range_iter_base<T>(current), step(step) {}
using detail::range_iter_base<T>::current;
DEVICE_CALLABLE
iter& operator++() {
current += step;
return *this;
}
DEVICE_CALLABLE
iter operator++(int) {
auto copy = *this;
++*this;
return copy;
}
// Loses commutativity. Iterator-based ranges are simply broken. :-(
DEVICE_CALLABLE
bool operator==(iter const& other) const {
return step > 0 ? current >= other.current : current < other.current;
}
DEVICE_CALLABLE
bool operator!=(iter const& other) const { return !(*this == other); }
private:
T step;
};
DEVICE_CALLABLE
step_range_proxy(T begin, T end, T step)
: begin_(begin, step), end_(end, step) {}
DEVICE_CALLABLE
iter begin() const { return begin_; }
DEVICE_CALLABLE
iter end() const { return end_; }
private:
iter begin_;
iter end_;
};
DEVICE_CALLABLE
range_proxy(T begin, T end) : begin_(begin), end_(end) {}
DEVICE_CALLABLE
step_range_proxy step(T step) { return {*begin_, *end_, step}; }
DEVICE_CALLABLE
iter begin() const { return begin_; }
DEVICE_CALLABLE
iter end() const { return end_; }
private:
iter begin_;
iter end_;
};
template <typename T>
struct infinite_range_proxy {
struct iter : detail::range_iter_base<T> {
DEVICE_CALLABLE
iter(T current = T()) : detail::range_iter_base<T>(current) {}
DEVICE_CALLABLE
bool operator==(iter const&) const { return false; }
DEVICE_CALLABLE
bool operator!=(iter const&) const { return true; }
};
struct step_range_proxy {
struct iter : detail::range_iter_base<T> {
DEVICE_CALLABLE
iter(T current = T(), T step = T())
: detail::range_iter_base<T>(current), step(step) {}
using detail::range_iter_base<T>::current;
DEVICE_CALLABLE
iter& operator++() {
current += step;
return *this;
}
DEVICE_CALLABLE
iter operator++(int) {
auto copy = *this;
++*this;
return copy;
}
DEVICE_CALLABLE
bool operator==(iter const&) const { return false; }
DEVICE_CALLABLE
bool operator!=(iter const&) const { return true; }
private:
T step;
};
DEVICE_CALLABLE
step_range_proxy(T begin, T step) : begin_(begin, step) {}
DEVICE_CALLABLE
iter begin() const { return begin_; }
DEVICE_CALLABLE
iter end() const { return iter(); }
private:
iter begin_;
};
DEVICE_CALLABLE
infinite_range_proxy(T begin) : begin_(begin) {}
DEVICE_CALLABLE
step_range_proxy step(T step) { return step_range_proxy(*begin_, step); }
DEVICE_CALLABLE
iter begin() const { return begin_; }
DEVICE_CALLABLE
iter end() const { return iter(); }
private:
iter begin_;
};
template <typename T>
DEVICE_CALLABLE range_proxy<T> range(T begin, T end) {
return {begin, end};
}
template <typename T>
DEVICE_CALLABLE infinite_range_proxy<T> range(T begin) {
return {begin};
}
namespace traits {
template <typename C>
struct has_size {
template <typename T>
static constexpr auto check(T*) ->
typename std::is_integral<decltype(std::declval<T const>().size())>::type;
template <typename>
static constexpr auto check(...) -> std::false_type;
using type = decltype(check<C>(0));
static constexpr bool value = type::value;
};
} // namespace traits
template <typename C,
typename = typename std::enable_if<traits::has_size<C>::value>>
DEVICE_CALLABLE auto indices(C const& cont)
-> range_proxy<decltype(cont.size())> {
return {0, cont.size()};
}
template <typename T, std::size_t N>
DEVICE_CALLABLE range_proxy<std::size_t> indices(T(&)[N]) {
return {0, N};
}
template <typename T>
range_proxy<typename std::initializer_list<T>::size_type> DEVICE_CALLABLE
indices(std::initializer_list<T>&& cont) {
return {0, cont.size()};
}
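// Editor's note, illustrative host-side usage (not part of the original header):
//   for (auto i : util::lang::range(0, 10).step(2)) { /* i = 0,2,4,6,8 */ }
//   int a[4];
//   for (auto i : util::lang::indices(a))           { /* i = 0,1,2,3 */ }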
} // namespace lang
} // namespace util
#endif // ndef UTIL_LANG_RANGE_HPP

File diff suppressed because it is too large


@ -0,0 +1,30 @@
cmake_minimum_required(VERSION 3.20)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
project(clock LANGUAGES C CXX CUDA)
find_package(CUDAToolkit REQUIRED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries
include_directories(../../../Common)
# Source file
# Add target for clock
add_executable(clock clock.cu)
target_compile_options(clock PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_features(clock PRIVATE cxx_std_17 cuda_std_17)
set_target_properties(clock PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
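# Editor's note (illustrative): from this sample's directory, a typical
# out-of-source build would be
#   cmake -B build && cmake --build build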


@ -1,340 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project (only supported on Mac OS X and Linux platforms)
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
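# Editor's note (illustrative): with SMS="50 60" the rules above expand to
#   GENCODE_FLAGS = -gencode arch=compute_50,code=sm_50
#                   -gencode arch=compute_60,code=sm_60
#                   -gencode arch=compute_60,code=compute_60
# i.e. SASS for each listed SM plus PTX for the highest one.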
ALL_CCFLAGS += --threads 0 --std=c++11
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: clock
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
clock.o:clock.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
clock: clock.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./clock
testrun: build
clean:
rm -f clock clock.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/clock
clobber: clean


@ -1,78 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>clock</name>
<cuda_api_list>
<toolkit>cudaMalloc</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaMemcpy</toolkit>
</cuda_api_list>
<description><![CDATA[This example shows how to use the clock function to accurately measure the performance of a block of threads in a kernel.]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../../Common</path>
</includepaths>
<keyconcepts>
<concept level="basic">Performance Strategies</concept>
</keyconcepts>
<keywords>
<keyword>performance</keyword>
<keyword>timing</keyword>
<keyword>CUDA</keyword>
<keyword>clock</keyword>
<keyword>timer</keyword>
</keywords>
<libraries>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>clock.cu</primary_file>
<scopes>
<scope>1:CUDA Basic Topics</scope>
<scope>1:Performance Strategies</scope>
</scopes>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm53</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<sm-arch>sm80</sm-arch>
<sm-arch>sm86</sm-arch>
<sm-arch>sm87</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>sbsa</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>Clock</title>
<type>exe</type>
</entry>


@ -10,7 +10,7 @@ Performance Strategies
 ## Supported SM Architectures
-[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)
+[SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)
 ## Supported OSes
@ -18,53 +18,15 @@ Linux, Windows
 ## Supported CPU Architecture
-x86_64, ppc64le, armv7l
+x86_64, armv7l
 ## CUDA APIs involved
 ### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
-cudaMalloc, cudaFree, cudaMemcpy
+cudaMalloc, cudaMemcpy, cudaFree
 ## Prerequisites
-Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
-## Build and Run
-### Windows
-The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
-```
-*_vs<version>.sln - for Visual Studio <version>
-```
-Each individual sample has its own set of solution files in its directory:
-To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
-> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details."
-### Linux
-The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
-```
-$ cd <sample_dir>
-$ make
-```
-The samples makefiles can take advantage of certain options:
-* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
-By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
-`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
-See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
-* **dbg=1** - build with debug symbols
-```
-$ make dbg=1
-```
-* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
-```
-$ make SMS="50 60"
-```
-* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
-```
-$ make HOST_COMPILER=g++
-```
 ## References (for more details)


@ -48,43 +48,46 @@
 // This kernel computes a standard parallel reduction and evaluates the
 // time it takes to do that for each block. The timing results are stored
 // in device memory.
-__global__ static void timedReduction(const float *input, float *output,
-                                      clock_t *timer) {
+__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
+{
     // __shared__ float shared[2 * blockDim.x];
     extern __shared__ float shared[];

     const int tid = threadIdx.x;
     const int bid = blockIdx.x;

-    if (tid == 0) timer[bid] = clock();
+    if (tid == 0)
+        timer[bid] = clock();

     // Copy input.
     shared[tid] = input[tid];
     shared[tid + blockDim.x] = input[tid + blockDim.x];

     // Perform reduction to find minimum.
     for (int d = blockDim.x; d > 0; d /= 2) {
         __syncthreads();

         if (tid < d) {
             float f0 = shared[tid];
             float f1 = shared[tid + d];

             if (f1 < f0) {
                 shared[tid] = f1;
             }
         }
     }

     // Write result.
-    if (tid == 0) output[bid] = shared[0];
+    if (tid == 0)
+        output[bid] = shared[0];

     __syncthreads();

-    if (tid == 0) timer[bid + gridDim.x] = clock();
+    if (tid == 0)
+        timer[bid + gridDim.x] = clock();
 }

 #define NUM_BLOCKS 64
 #define NUM_THREADS 256

 // It's interesting to change the number of blocks and the number of threads to
@ -104,50 +107,46 @@ __global__ static void timedReduction(const float *input, float *output,
 // the memory. With more than 32 the speed scales linearly.

 // Start the main CUDA Sample here
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("CUDA Clock sample\n");

     // This will pick the best possible CUDA capable device
     int dev = findCudaDevice(argc, (const char **)argv);

     float *dinput = NULL;
     float *doutput = NULL;
     clock_t *dtimer = NULL;

     clock_t timer[NUM_BLOCKS * 2];
     float input[NUM_THREADS * 2];

     for (int i = 0; i < NUM_THREADS * 2; i++) {
         input[i] = (float)i;
     }

-    checkCudaErrors(
-        cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
+    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
     checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
-    checkCudaErrors(
-        cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));

-    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2,
-                               cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));

-    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(
-        dinput, doutput, dtimer);
+    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);

-    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2,
-                               cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));

     checkCudaErrors(cudaFree(dinput));
     checkCudaErrors(cudaFree(doutput));
     checkCudaErrors(cudaFree(dtimer));

     long double avgElapsedClocks = 0;

     for (int i = 0; i < NUM_BLOCKS; i++) {
         avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
     }

     avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
     printf("Average clocks/block = %Lf\n", avgElapsedClocks);

     return EXIT_SUCCESS;
 }


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock", "clock_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,112 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>clock_vs2017</RootNamespace>
<ProjectName>clock</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/clock.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="clock.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock", "clock_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>clock_vs2019</RootNamespace>
<ProjectName>clock</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/clock.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="clock.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock", "clock_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>clock_vs2022</RootNamespace>
<ProjectName>clock</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/clock.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="clock.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -0,0 +1,39 @@
cmake_minimum_required(VERSION 3.20)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
project(clock_nvrtc LANGUAGES C CXX CUDA)
find_package(CUDAToolkit REQUIRED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries
include_directories(../../../Common)
# Source file
# Add sample target executable
add_executable(clock_nvrtc clock.cpp)
target_compile_options(clock_nvrtc PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
target_compile_features(clock_nvrtc PRIVATE cxx_std_17 cuda_std_17)
target_link_libraries(clock_nvrtc PRIVATE
CUDA::nvrtc
CUDA::cuda_driver
)
# Copy clock_kernel.cu to the output directory
add_custom_command(TARGET clock_nvrtc POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_CURRENT_SOURCE_DIR}/clock_kernel.cu ${CMAKE_CURRENT_BINARY_DIR}
)
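# Editor's note (assumed invocation, not part of the original file): a typical
# out-of-source configure/build of this sample would be
#   cmake -B build -S . && cmake --build build
# with -DENABLE_CUDA_DEBUG=ON at configure time selecting the device-debug
# (-G) path in the conditional above.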

View File

@@ -1,392 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project (only supported on Mac OS X and Linux platforms)
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
UBUNTU = $(shell lsb_release -i -s 2>/dev/null | grep -i ubuntu)
SAMPLE_ENABLED := 1
# This sample is not supported on ARMv7
ifeq ($(TARGET_ARCH),armv7l)
$(info >>> WARNING - clock_nvrtc is not supported on ARMv7 - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# libNVRTC specific libraries
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -L$(CUDA_PATH)/lib -F/Library/Frameworks -framework CUDA
endif
ifeq ($(TARGET_OS),darwin)
ALL_LDFLAGS += -Xcompiler -F/Library/Frameworks -Xlinker -framework -Xlinker CUDA
else
ifeq ($(TARGET_ARCH),x86_64)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/lib64/stubs
CUDA_SEARCH_PATH += $(CUDA_PATH)/lib/stubs
CUDA_SEARCH_PATH += $(CUDA_PATH)/targets/x86_64-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-gnueabihf/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/sbsa-linux/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/armv7-linux-androideabi/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-linux-androideabi/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ARMv7-linux-QNX/lib/stubs
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/aarch64-qnx/lib/stubs
ifdef TARGET_OVERRIDE
CUDA_SEARCH_PATH := $(CUDA_PATH)/targets/$(TARGET_OVERRIDE)/lib/stubs
endif
endif
ifeq ($(TARGET_ARCH),ppc64le)
CUDA_SEARCH_PATH ?= $(CUDA_PATH)/targets/ppc64le-linux/lib/stubs
endif
ifeq ($(HOST_ARCH),ppc64le)
CUDA_SEARCH_PATH += $(CUDA_PATH)/lib64/stubs
endif
CUDALIB ?= $(shell find -L $(CUDA_SEARCH_PATH) -maxdepth 1 -name libcuda.so 2> /dev/null)
ifeq ("$(CUDALIB)","")
$(info >>> WARNING - libcuda.so not found, CUDA Driver is not installed. Please re-install the driver. <<<)
SAMPLE_ENABLED := 0
else
CUDALIB := $(shell echo $(CUDALIB) | sed "s/ .*//" | sed "s/\/libcuda.so//" )
LIBRARIES += -L$(CUDALIB) -lcuda
endif
endif
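# Editor's note (assumption, not in the original file): the stubs directories
# searched above contain a link-time-only libcuda.so; at run time the real
# libcuda.so installed by the NVIDIA driver is loaded instead, which is why a
# missing stub is reported as a missing driver.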
ALL_CCFLAGS += --threads 0 --std=c++11
INCLUDES += -I$(CUDA_PATH)/include
LIBRARIES += -lnvrtc
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: clock_nvrtc
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
clock.o:clock.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
clock_nvrtc: clock.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./clock_nvrtc
testrun: build
clean:
rm -f clock_nvrtc clock.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/clock_nvrtc
clobber: clean
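# Editor's note (assumed usage, not in the original file): `make` builds the
# sample, `make run` builds and executes it, and `make dbg=1` enables the
# debug flags selected above.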

View File

@@ -10,7 +10,7 @@ Performance Strategies, Runtime Compilation

 ## Supported SM Architectures

-[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)
+[SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus) [SM 8.9 ](https://developer.nvidia.com/cuda-gpus) [SM 9.0 ](https://developer.nvidia.com/cuda-gpus)

 ## Supported OSes
@@ -18,60 +18,22 @@ Linux, Windows, QNX

 ## Supported CPU Architecture

-x86_64, ppc64le, aarch64
+x86_64, aarch64

 ## CUDA APIs involved

 ### [CUDA Driver API](http://docs.nvidia.com/cuda/cuda-driver-api/index.html)
-cuModuleGetFunction, cuMemAlloc, cuLaunchKernel, cuCtxSynchronize, cuMemFree, cuMemcpyDtoH, cuMemcpyHtoD
+cuMemcpyDtoH, cuLaunchKernel, cuMemcpyHtoD, cuCtxSynchronize, cuMemAlloc, cuMemFree, cuModuleGetFunction

 ### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
 cudaBlockSize, cudaGridSize

 ## Dependencies needed to build/run
-[NVRTC](../../README.md#nvrtc)
+[NVRTC](../../../README.md#nvrtc)

 ## Prerequisites

-Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

-## Build and Run
-
-### Windows
-The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
-```
-*_vs<version>.sln - for Visual Studio <version>
-```
-Each individual sample has its own set of solution files in its directory:
-
-To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
-
-> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check DirectX Dependencies section for details."
-
-### Linux
-The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
-```
-$ cd <sample_dir>
-$ make
-```
-The samples makefiles can take advantage of certain options:
-
-* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, aarch64.
-    By default, TARGET_ARCH is set to HOST_ARCH. On a x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
-    `$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=aarch64` <br/>
-    See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
-* **dbg=1** - build with debug symbols
-    ```
-    $ make dbg=1
-    ```
-* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
-    ```
-    $ make SMS="50 60"
-    ```
-* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
-    ```
-    $ make HOST_COMPILER=g++
-    ```

 ## References (for more details)

View File

@@ -34,12 +34,11 @@
  */

 // System includes
-#include <stdio.h>
-#include <stdint.h>
 #include <assert.h>
 #include <cuda_runtime.h>
 #include <nvrtc_helper.h>
+#include <stdint.h>
+#include <stdio.h>

 // helper functions and utilities to work with CUDA
 #include <helper_functions.h>
@@ -71,64 +70,68 @@
 // Start the main CUDA Sample here
-int main(int argc, char **argv) {
-  printf("CUDA Clock sample\n");
+int main(int argc, char **argv)
+{
+    printf("CUDA Clock sample\n");

     typedef long clock_t;

     clock_t timer[NUM_BLOCKS * 2];
     float input[NUM_THREADS * 2];

     for (int i = 0; i < NUM_THREADS * 2; i++) {
         input[i] = (float)i;
     }

     char *cubin, *kernel_file;
     size_t cubinSize;

     kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
     compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

     CUmodule module = loadCUBIN(cubin, argc, argv);

     CUfunction kernel_addr;
     checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));

     dim3 cudaBlockSize(NUM_THREADS, 1, 1);
     dim3 cudaGridSize(NUM_BLOCKS, 1, 1);

     CUdeviceptr dinput, doutput, dtimer;
     checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
     checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
     checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
     checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));

     void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};

-  checkCudaErrors(cuLaunchKernel(
-      kernel_addr, cudaGridSize.x, cudaGridSize.y,
-      cudaGridSize.z, /* grid dim */
-      cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
-      sizeof(float) * 2 * NUM_THREADS, 0, /* shared mem, stream */
-      &arr[0], /* arguments */
-      0));
+    checkCudaErrors(cuLaunchKernel(kernel_addr,
+                                   cudaGridSize.x,
+                                   cudaGridSize.y,
+                                   cudaGridSize.z, /* grid dim */
+                                   cudaBlockSize.x,
+                                   cudaBlockSize.y,
+                                   cudaBlockSize.z, /* block dim */
+                                   sizeof(float) * 2 * NUM_THREADS,
+                                   0, /* shared mem, stream */
+                                   &arr[0], /* arguments */
+                                   0));

     checkCudaErrors(cuCtxSynchronize());
-  checkCudaErrors(
-      cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+    checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
     checkCudaErrors(cuMemFree(dinput));
     checkCudaErrors(cuMemFree(doutput));
     checkCudaErrors(cuMemFree(dtimer));

     long double avgElapsedClocks = 0;

     for (int i = 0; i < NUM_BLOCKS; i++) {
         avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
     }

     avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
     printf("Average clocks/block = %Lf\n", avgElapsedClocks);

     return EXIT_SUCCESS;
 }
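
The `compileFileToCUBIN` and `loadCUBIN` calls above are helpers from `nvrtc_helper.h` under `Common/`. As a rough, illustrative sketch only (not the actual helper implementation), the same flow can be expressed with the public NVRTC and driver APIs, taking the simpler PTX path and assuming `cuInit` has run and a context is current:

```
// Illustrative sketch, not the nvrtc_helper.h implementation.
#include <cuda.h>
#include <nvrtc.h>
#include <cstdio>
#include <cstdlib>
#include <vector>

// Compile in-memory CUDA source to PTX and load it as a module.
// Assumes cuInit(0) has been called and a context is current.
static CUmodule compileAndLoad(const char *source, const char *name)
{
    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, source, name, 0, nullptr, nullptr);

    if (nvrtcCompileProgram(prog, 0, nullptr) != NVRTC_SUCCESS) {
        size_t logSize = 0;
        nvrtcGetProgramLogSize(prog, &logSize);
        std::vector<char> log(logSize);
        nvrtcGetProgramLog(prog, log.data());
        std::fprintf(stderr, "NVRTC compile failed:\n%s\n", log.data());
        std::exit(EXIT_FAILURE);
    }

    size_t ptxSize = 0;
    nvrtcGetPTXSize(prog, &ptxSize);
    std::vector<char> ptx(ptxSize);
    nvrtcGetPTX(prog, ptx.data());
    nvrtcDestroyProgram(&prog);

    CUmodule module;
    cuModuleLoadData(&module, ptx.data()); // error checking elided for brevity
    return module;
}
```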

View File

@@ -37,38 +37,41 @@
 // time it takes to do that for each block. The timing results are stored
 // in device memory.

-extern "C" __global__ void timedReduction(const float *input, float *output,
-                                          clock_t *timer) {
+extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer)
+{
     // __shared__ float shared[2 * blockDim.x];
     extern __shared__ float shared[];

     const int tid = threadIdx.x;
     const int bid = blockIdx.x;

-  if (tid == 0) timer[bid] = clock();
+    if (tid == 0)
+        timer[bid] = clock();

     // Copy input.
     shared[tid] = input[tid];
     shared[tid + blockDim.x] = input[tid + blockDim.x];

     // Perform reduction to find minimum.
     for (int d = blockDim.x; d > 0; d /= 2) {
         __syncthreads();

         if (tid < d) {
             float f0 = shared[tid];
             float f1 = shared[tid + d];

             if (f1 < f0) {
                 shared[tid] = f1;
             }
         }
     }

     // Write result.
-  if (tid == 0) output[bid] = shared[0];
+    if (tid == 0)
+        output[bid] = shared[0];

     __syncthreads();

-  if (tid == 0) timer[bid + gridDim.x] = clock();
+    if (tid == 0)
+        timer[bid + gridDim.x] = clock();
 }
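
The `sizeof(float) * 2 * NUM_THREADS` argument passed to `cuLaunchKernel` in `clock.cpp` is what sizes the `extern __shared__ float shared[]` declaration in this kernel. For readers more familiar with the runtime API, a hypothetical equivalent launch (assuming the kernel were compiled directly into the executable instead of through NVRTC) might look like:

```
// Hypothetical runtime-API fragment; the sample itself launches through the
// driver API after NVRTC compilation. NUM_BLOCKS/NUM_THREADS as in the sample.
float   *dinput, *doutput;
clock_t *dtimer;
cudaMalloc(&dinput, sizeof(float) * NUM_THREADS * 2);
cudaMalloc(&doutput, sizeof(float) * NUM_BLOCKS);
cudaMalloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2);

// The third launch parameter is the dynamic shared-memory size in bytes;
// it backs the kernel's `extern __shared__ float shared[]` array.
timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
cudaDeviceSynchronize();
```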

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock_nvrtc", "clock_nvrtc_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,112 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>clock_nvrtc_vs2017</RootNamespace>
<ProjectName>clock_nvrtc</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/clock_nvrtc.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration></CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="clock.cpp" />
<None Include="clock_kernel.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock_nvrtc", "clock_nvrtc_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>clock_nvrtc_vs2019</RootNamespace>
<ProjectName>clock_nvrtc</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/clock_nvrtc.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration></CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="clock.cpp" />
<None Include="clock_kernel.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "clock_nvrtc", "clock_nvrtc_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View File

@@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>clock_nvrtc_vs2022</RootNamespace>
<ProjectName>clock_nvrtc</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;$(CudaToolkitIncludeDir);$(CUDA_PATH)/include;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cuda.lib;nvrtc.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/clock_nvrtc.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration></CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="clock.cpp" />
<None Include="clock_kernel.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

View File

@@ -1,340 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project (only supported on Mac OS X and Linux platforms)
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
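# Illustrative note (not part of the original build logic): with SMS="50 60",
# the foreach/eval above expands GENCODE_FLAGS to
#   -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60
#   -gencode arch=compute_60,code=compute_60
# i.e. SASS for each listed SM plus PTX for the highest one.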
ALL_CCFLAGS += --threads 0 --std=c++11
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
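# Illustrative note (not part of the original build logic): when the sample is
# waived, EXEC expands to an echo command, so every $(EXEC)-prefixed recipe
# line below is printed instead of executed.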
################################################################################
# Target rules
all: build
build: concurrentKernels
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
concurrentKernels.o:concurrentKernels.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
concurrentKernels: concurrentKernels.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./concurrentKernels
testrun: build
clean:
rm -f concurrentKernels concurrentKernels.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/concurrentKernels
clobber: clean

@@ -1,87 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>concurrentKernels</name>
<cuda_api_list>
<toolkit>cudaStreamWaitEvent</toolkit>
<toolkit>cudaStreamDestroy</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaEventRecord</toolkit>
<toolkit>cudaMallocHost</toolkit>
<toolkit>cudaStreamCreate</toolkit>
<toolkit>cudaEventCreate</toolkit>
<toolkit>cudaEventElapsedTime</toolkit>
<toolkit>cudaEventSynchronize</toolkit>
<toolkit>cudaFreeHost</toolkit>
<toolkit>cudaMalloc</toolkit>
<toolkit>cudaEventCreateWithFlags</toolkit>
<toolkit>cudaEventDestroy</toolkit>
<toolkit>cudaMemcpyAsync</toolkit>
<toolkit>cudaGetDeviceProperties</toolkit>
<toolkit>cudaGetDevice</toolkit>
</cuda_api_list>
<description><![CDATA[This sample demonstrates the use of CUDA streams for concurrent execution of several kernels on a GPU device. It also illustrates how to introduce dependencies between CUDA streams with the cudaStreamWaitEvent function.]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../../Common</path>
</includepaths>
<keyconcepts>
<concept level="advanced">Performance Strategies</concept>
</keyconcepts>
<keywords>
<keyword>CUDA</keyword>
<keyword>Concurrent Kernels</keyword>
</keywords>
<libraries>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>concurrentKernels.cu</primary_file>
<scopes>
<scope>1:CUDA Advanced Topics</scope>
<scope>1:Performance Strategies</scope>
</scopes>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm53</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<sm-arch>sm80</sm-arch>
<sm-arch>sm86</sm-arch>
<sm-arch>sm87</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>sbsa</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>Concurrent Kernels</title>
</entry>

@@ -1,70 +0,0 @@
# concurrentKernels - Concurrent Kernels
## Description
This sample demonstrates the use of CUDA streams for concurrent execution of several kernels on a GPU device. It also illustrates how to introduce dependencies between CUDA streams with the cudaStreamWaitEvent function.
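In essence, the sample launches several dummy kernels into their own streams, then makes one final stream wait on all of them before reducing the results. A minimal, self-contained sketch of that stream/event pattern (illustrative only; the kernels and names below are not taken from the sample):
```
// Sketch of ordering two streams with an event (illustrative, not sample code).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void produce(int *x) { *x = 42; }
__global__ void consume(const int *x, int *y) { *y = *x + 1; }

int main() {
    int *d = nullptr;
    cudaMalloc(&d, 2 * sizeof(int));

    cudaStream_t s0, s1;
    cudaStreamCreate(&s0);
    cudaStreamCreate(&s1);

    cudaEvent_t done;
    // Disable timing: the event is used purely for ordering, as in the sample.
    cudaEventCreateWithFlags(&done, cudaEventDisableTiming);

    produce<<<1, 1, 0, s0>>>(d);
    cudaEventRecord(done, s0);         // mark completion of the work in s0
    cudaStreamWaitEvent(s1, done, 0);  // s1 waits on the GPU; the host never blocks
    consume<<<1, 1, 0, s1>>>(d, d + 1);

    int result = 0;
    cudaMemcpyAsync(&result, d + 1, sizeof(int), cudaMemcpyDeviceToHost, s1);
    cudaStreamSynchronize(s1);
    printf("result = %d\n", result);   // expect 43

    cudaEventDestroy(done);
    cudaStreamDestroy(s0);
    cudaStreamDestroy(s1);
    cudaFree(d);
    return 0;
}
```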
## Key Concepts
Performance Strategies
## Supported SM Architectures
[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux, Windows
## Supported CPU Architecture
x86_64, ppc64le, armv7l
## CUDA APIs involved
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaStreamWaitEvent, cudaStreamDestroy, cudaFree, cudaEventRecord, cudaMallocHost, cudaStreamCreate, cudaEventCreate, cudaEventElapsedTime, cudaEventSynchronize, cudaFreeHost, cudaMalloc, cudaEventCreateWithFlags, cudaEventDestroy, cudaMemcpyAsync, cudaGetDeviceProperties, cudaGetDevice
## Prerequisites
Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run
### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory.
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples' makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is equivalent to setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
## References (for more details)

@@ -1,228 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
//
// This sample demonstrates the use of streams for concurrent execution. It also
// illustrates how to introduce dependencies between CUDA streams with the
// cudaStreamWaitEvent function.
//
// Devices of compute capability 2.0 or higher can overlap the kernels
//
#include <cooperative_groups.h>
#include <stdio.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include <helper_functions.h>
// This is a kernel that does no real work but runs for at least a specified
// number of clocks
__global__ void clock_block(clock_t *d_o, clock_t clock_count) {
unsigned int start_clock = (unsigned int)clock();
clock_t clock_offset = 0;
while (clock_offset < clock_count) {
unsigned int end_clock = (unsigned int)clock();
// The code below should work like
// this (thanks to modular arithmetic):
//
// clock_offset = (clock_t) (end_clock > start_clock ?
// end_clock - start_clock :
// end_clock + (0xffffffffu - start_clock));
//
// Indeed, let m = 2^32; then
// end - start = end + m - start (mod m).
clock_offset = (clock_t)(end_clock - start_clock);
}
d_o[0] = clock_offset;
}
// Single warp reduction kernel
__global__ void sum(clock_t *d_clocks, int N) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ clock_t s_clocks[32];
clock_t my_sum = 0;
for (int i = threadIdx.x; i < N; i += blockDim.x) {
my_sum += d_clocks[i];
}
s_clocks[threadIdx.x] = my_sum;
cg::sync(cta);
for (int i = 16; i > 0; i /= 2) {
if (threadIdx.x < i) {
s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
}
cg::sync(cta);
}
d_clocks[0] = s_clocks[0];
}
int main(int argc, char **argv) {
int nkernels = 8; // number of concurrent kernels
int nstreams = nkernels + 1; // use one more stream than concurrent kernels
int nbytes = nkernels * sizeof(clock_t); // number of data bytes
float kernel_time = 10; // time the kernel should run in ms
float elapsed_time; // timing variables
int cuda_device = 0;
printf("[%s] - Starting...\n", argv[0]);
// get number of kernels if overridden on the command line
if (checkCmdLineFlag(argc, (const char **)argv, "nkernels")) {
nkernels = getCmdLineArgumentInt(argc, (const char **)argv, "nkernels");
nstreams = nkernels + 1;
}
// use command-line specified CUDA device, otherwise use device with highest
// Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp deviceProp;
checkCudaErrors(cudaGetDevice(&cuda_device));
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
if ((deviceProp.concurrentKernels == 0)) {
printf("> GPU does not support concurrent kernel execution\n");
printf(" CUDA kernel runs will be serialized\n");
}
printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
// allocate host memory
clock_t *a = 0; // pointer to the array data in host memory
checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
// allocate device memory
clock_t *d_a = 0; // pointer to the data in device memory
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
// allocate and initialize an array of stream handles
cudaStream_t *streams =
(cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++) {
checkCudaErrors(cudaStreamCreate(&(streams[i])));
}
// create CUDA event handles
cudaEvent_t start_event, stop_event;
checkCudaErrors(cudaEventCreate(&start_event));
checkCudaErrors(cudaEventCreate(&stop_event));
// The events are used for synchronization only, and hence do not need to
// record timings. This also keeps the events from introducing global sync
// points when recorded, which is critical to getting overlap.
cudaEvent_t *kernelEvent;
kernelEvent = (cudaEvent_t *)malloc(nkernels * sizeof(cudaEvent_t));
for (int i = 0; i < nkernels; i++) {
checkCudaErrors(
cudaEventCreateWithFlags(&(kernelEvent[i]), cudaEventDisableTiming));
}
//////////////////////////////////////////////////////////////////////
// time execution with nkernels streams
clock_t total_clocks = 0;
#if defined(__arm__) || defined(__aarch64__)
// On Arm architectures the kernel takes more time than the channel reset
// time, so reduce time_clocks to prevent hangs.
clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 100));
#else
clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
#endif
cudaEventRecord(start_event, 0);
// queue nkernels in separate streams and record when they are done
for (int i = 0; i < nkernels; ++i) {
clock_block<<<1, 1, 0, streams[i]>>>(&d_a[i], time_clocks);
total_clocks += time_clocks;
checkCudaErrors(cudaEventRecord(kernelEvent[i], streams[i]));
// make the last stream wait for the kernel event to be recorded
checkCudaErrors(
cudaStreamWaitEvent(streams[nstreams - 1], kernelEvent[i], 0));
}
// queue a sum kernel and a copy back to host in the last stream.
// the commands in this stream get dispatched as soon as all the kernel events
// have been recorded
sum<<<1, 32, 0, streams[nstreams - 1]>>>(d_a, nkernels);
checkCudaErrors(cudaMemcpyAsync(
a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost, streams[nstreams - 1]));
// At this point the CPU has dispatched all work for the GPU and can
// continue processing other tasks in parallel. In this sample we just wait
// until the GPU is done.
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
printf("Expected time for serial execution of %d kernels = %.3fs\n", nkernels,
nkernels * kernel_time / 1000.0f);
printf("Expected time for concurrent execution of %d kernels = %.3fs\n",
nkernels, kernel_time / 1000.0f);
printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f);
bool bTestResult = (a[0] > total_clocks);
// release resources
for (int i = 0; i < nkernels; i++) {
cudaStreamDestroy(streams[i]);
cudaEventDestroy(kernelEvent[i]);
}
free(streams);
free(kernelEvent);
cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
cudaFreeHost(a);
cudaFree(d_a);
if (!bTestResult) {
printf("Test failed!\n");
exit(EXIT_FAILURE);
}
printf("Test passed\n");
exit(EXIT_SUCCESS);
}

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

@@ -1,112 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>concurrentKernels_vs2017</RootNamespace>
<ProjectName>concurrentKernels</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/concurrentKernels.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="concurrentKernels.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

@@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>concurrentKernels_vs2019</RootNamespace>
<ProjectName>concurrentKernels</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/concurrentKernels.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="concurrentKernels.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

@@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

@@ -1,108 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>concurrentKernels_vs2022</RootNamespace>
<ProjectName>concurrentKernels</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/concurrentKernels.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="concurrentKernels.cu" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>

@@ -1,346 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux platforms
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar; differentiate them at the host level
# for now by checking whether the toolkit ships an sbsa-linux target directory.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# When on a native aarch64 system with a 32-bit userspace, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
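# Illustrative note (not part of the original build logic): $(sort) orders
# words lexicographically, which is correct for the two-digit SM values used
# here but would misorder a three-digit value such as 100.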
ALL_CCFLAGS += --threads 0 --std=c++11
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: cppIntegration
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
cppIntegration.o:cppIntegration.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
cppIntegration_gold.o:cppIntegration_gold.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
main.o:main.cpp
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
cppIntegration: cppIntegration.o cppIntegration_gold.o main.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./cppIntegration
testrun: build
clean:
rm -f cppIntegration cppIntegration.o cppIntegration_gold.o main.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/cppIntegration
clobber: clean

@@ -1,72 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
<name>cppIntegration</name>
<cuda_api_list>
<toolkit>cudaMalloc</toolkit>
<toolkit>cudaFree</toolkit>
<toolkit>cudaMemcpy</toolkit>
</cuda_api_list>
<description><![CDATA[This example demonstrates how to integrate CUDA into an existing C++ application: the CUDA entry point on the host side is just a function called from C++ code, and only the file containing this function is compiled with nvcc. It also demonstrates that vector types can be used from C++.]]></description>
<devicecompilation>whole</devicecompilation>
<includepaths>
<path>./</path>
<path>../</path>
<path>../../../Common</path>
</includepaths>
<keyconcepts>
<concept level="basic">CPP-CUDA Integration</concept>
</keyconcepts>
<keywords>
</keywords>
<libraries>
</libraries>
<librarypaths>
</librarypaths>
<nsight_eclipse>true</nsight_eclipse>
<primary_file>cppIntegration.cu</primary_file>
<scopes>
<scope>1:CUDA Basic Topics</scope>
</scopes>
<sm-arch>sm35</sm-arch>
<sm-arch>sm37</sm-arch>
<sm-arch>sm50</sm-arch>
<sm-arch>sm52</sm-arch>
<sm-arch>sm53</sm-arch>
<sm-arch>sm60</sm-arch>
<sm-arch>sm61</sm-arch>
<sm-arch>sm70</sm-arch>
<sm-arch>sm72</sm-arch>
<sm-arch>sm75</sm-arch>
<sm-arch>sm80</sm-arch>
<sm-arch>sm86</sm-arch>
<sm-arch>sm87</sm-arch>
<supported_envs>
<env>
<arch>x86_64</arch>
<platform>linux</platform>
</env>
<env>
<platform>windows7</platform>
</env>
<env>
<arch>x86_64</arch>
<platform>macosx</platform>
</env>
<env>
<arch>arm</arch>
</env>
<env>
<arch>sbsa</arch>
</env>
<env>
<arch>ppc64le</arch>
<platform>linux</platform>
</env>
</supported_envs>
<supported_sm_architectures>
<include>all</include>
</supported_sm_architectures>
<title>C++ Integration</title>
<type>exe</type>
</entry>

@@ -1,70 +0,0 @@
# cppIntegration - C++ Integration
## Description
This example demonstrates how to integrate CUDA into an existing C++ application: the CUDA entry point on the host side is just a function called from C++ code, and only the file containing this function is compiled with nvcc. It also demonstrates that vector types can be used from C++.
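A minimal, self-contained sketch of the layout this implies (illustrative only; the file and function names are hypothetical, not the sample's own):
```
// gpu_part.cu (compiled with nvcc; the only file that sees CUDA syntax)
#include <cuda_runtime.h>

__global__ void addOne(int *data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] += 1;
}

extern "C" void launchAddOne(int *hostData, int n) {
    int *d = nullptr;
    cudaMalloc(&d, n * sizeof(int));
    cudaMemcpy(d, hostData, n * sizeof(int), cudaMemcpyHostToDevice);
    addOne<<<(n + 255) / 256, 256>>>(d, n);
    cudaMemcpy(hostData, d, n * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d);
}

// main.cpp (compiled with the host C++ compiler; no CUDA headers needed)
extern "C" void launchAddOne(int *hostData, int n);

int main() {
    int data[4] = {1, 2, 3, 4};
    launchAddOne(data, 4);  // the single CUDA entry point called from C++
    return 0;               // data is now {2, 3, 4, 5}
}
```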
## Key Concepts
CPP-CUDA Integration
## Supported SM Architectures
[SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 5.3 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) [SM 8.7 ](https://developer.nvidia.com/cuda-gpus)
## Supported OSes
Linux, Windows
## Supported CPU Architecture
x86_64, ppc64le, armv7l
## CUDA APIs involved
### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html)
cudaMalloc, cudaFree, cudaMemcpy
## Prerequisites
Download and install the [CUDA Toolkit 11.6](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## Build and Run
### Windows
The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format:
```
*_vs<version>.sln - for Visual Studio <version>
```
Each individual sample has its own set of solution files in its directory.
To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used.
> **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details.
### Linux
The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make:
```
$ cd <sample_dir>
$ make
```
The samples' makefiles can take advantage of certain options:
* **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l.
By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is equivalent to setting TARGET_ARCH=x86_64.<br/>
`$ make TARGET_ARCH=x86_64` <br/> `$ make TARGET_ARCH=ppc64le` <br/> `$ make TARGET_ARCH=armv7l` <br/>
See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details.
* **dbg=1** - build with debug symbols
```
$ make dbg=1
```
* **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`.
```
$ make SMS="50 60"
```
* **HOST_COMPILER=<host_compiler>** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers.
```
$ make HOST_COMPILER=g++
```
## References (for more details)

@@ -1,172 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Example of integrating CUDA functions into an existing
* application / framework.
* Host part of the device code.
* Compiled with Cuda compiler.
*/
// System includes
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>
// CUDA runtime
#include <cuda_runtime.h>
// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>
#ifndef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
extern "C" void computeGold(char *reference, char *idata,
const unsigned int len);
extern "C" void computeGold2(int2 *reference, int2 *idata,
const unsigned int len);
///////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//! @param g_odata memory to process (in and out)
///////////////////////////////////////////////////////////////////////////////
__global__ void kernel(int *g_data) {
// write data to global memory
const unsigned int tid = threadIdx.x;
int data = g_data[tid];
// use integer arithmetic to process all four bytes with one thread
// this serializes the execution, but is the simplest solution to avoid
// bank conflicts for this very low number of threads
// in general it is more efficient to process each byte by a separate thread,
// to avoid bank conflicts the access pattern should be
// g_data[4 * wtid + wid], where wtid is the thread id within the half warp
// and wid is the warp id
// see also the programming guide for a more in depth discussion.
g_data[tid] =
((((data << 0) >> 24) - 10) << 24) | ((((data << 8) >> 24) - 10) << 16) |
((((data << 16) >> 24) - 10) << 8) | ((((data << 24) >> 24) - 10) << 0);
}
///////////////////////////////////////////////////////////////////////////////
//! Demonstration that int2 data can be used in the cpp code
//! @param g_odata memory to process (in and out)
///////////////////////////////////////////////////////////////////////////////
__global__ void kernel2(int2 *g_data) {
// write data to global memory
const unsigned int tid = threadIdx.x;
int2 data = g_data[tid];
// use integer arithmetic to process all four bytes with one thread
// this serializes the execution, but is the simplest solution to avoid
// bank conflicts for this very low number of threads
// in general it is more efficient to process each byte by a separate thread,
// to avoid bank conflicts the access pattern should be
// g_data[4 * wtid + wid], where wtid is the thread id within the half warp
// and wid is the warp id
// see also the programming guide for a more in depth discussion.
g_data[tid].x = data.x - data.y;
}
////////////////////////////////////////////////////////////////////////////////
//! Entry point for Cuda functionality on host side
//! @param argc command line argument count
//! @param argv command line arguments
//! @param data data to process on the device
//! @param len len of \a data
////////////////////////////////////////////////////////////////////////////////
extern "C" bool runTest(const int argc, const char **argv, char *data,
int2 *data_int2, unsigned int len) {
// use command-line specified CUDA device, otherwise use device with highest
// Gflops/s
findCudaDevice(argc, (const char **)argv);
const unsigned int num_threads = len / 4;
assert(0 == (len % 4));
const unsigned int mem_size = sizeof(char) * len;
const unsigned int mem_size_int2 = sizeof(int2) * len;
// allocate device memory
char *d_data;
checkCudaErrors(cudaMalloc((void **)&d_data, mem_size));
// copy host memory to device
checkCudaErrors(cudaMemcpy(d_data, data, mem_size, cudaMemcpyHostToDevice));
// allocate device memory for int2 version
int2 *d_data_int2;
checkCudaErrors(cudaMalloc((void **)&d_data_int2, mem_size_int2));
// copy host memory to device
checkCudaErrors(cudaMemcpy(d_data_int2, data_int2, mem_size_int2,
cudaMemcpyHostToDevice));
// setup execution parameters
dim3 grid(1, 1, 1);
dim3 threads(num_threads, 1, 1);
dim3 threads2(len, 1, 1); // more threads needed for the separate int2 version
// execute the kernel
kernel<<<grid, threads>>>((int *)d_data);
kernel2<<<grid, threads2>>>(d_data_int2);
// check if kernel execution generated an error
getLastCudaError("Kernel execution failed");
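// note: kernel launches are asynchronous; the cudaMemcpy calls below
// synchronize with the device before the results are read back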
// compute reference solutions
char *reference = (char *)malloc(mem_size);
computeGold(reference, data, len);
int2 *reference2 = (int2 *)malloc(mem_size_int2);
computeGold2(reference2, data_int2, len);
// copy results from device to host
checkCudaErrors(cudaMemcpy(data, d_data, mem_size, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(data_int2, d_data_int2, mem_size_int2,
cudaMemcpyDeviceToHost));
// check result
bool success = true;
for (unsigned int i = 0; i < len; i++) {
if (reference[i] != data[i] || reference2[i].x != data_int2[i].x ||
reference2[i].y != data_int2[i].y) {
success = false;
}
}
// cleanup memory
checkCudaErrors(cudaFree(d_data));
checkCudaErrors(cudaFree(d_data_int2));
free(reference);
free(reference2);
return success;
}


@ -1,67 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Example of integrating CUDA functions into an existing
* application / framework.
* Reference solution computation.
*/
// Required header to support CUDA vector types
#include <vector_types.h>
////////////////////////////////////////////////////////////////////////////////
// export C interface
extern "C" void computeGold(char *reference, char *idata,
const unsigned int len);
extern "C" void computeGold2(int2 *reference, int2 *idata,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
//! Each element is decremented by 10
//! @param reference preallocated buffer that receives the reference data
//! @param idata input data as provided to device
//! @param len number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
void computeGold(char *reference, char *idata, const unsigned int len) {
for (unsigned int i = 0; i < len; ++i) reference[i] = idata[i] - 10;
}
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set for int2 version
//! Each x component is replaced by x - y; the y component is left unchanged
//! @param reference preallocated buffer that receives the reference data
//! @param idata input data as provided to device
//! @param len number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
void computeGold2(int2 *reference, int2 *idata, const unsigned int len) {
for (unsigned int i = 0; i < len; ++i) {
reference[i].x = idata[i].x - idata[i].y;
reference[i].y = idata[i].y;
}
}


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2017
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cppIntegration", "cppIntegration_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,114 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>cppIntegration_vs2017</RootNamespace>
<ProjectName>cppIntegration</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/cppIntegration.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="cppIntegration.cu" />
<ClCompile Include="cppIntegration_gold.cpp" />
<ClCompile Include="main.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2019
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cppIntegration", "cppIntegration_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,110 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>cppIntegration_vs2019</RootNamespace>
<ProjectName>cppIntegration</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v142</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/cppIntegration.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="cppIntegration.cu" />
<ClCompile Include="cppIntegration_gold.cpp" />
<ClCompile Include="main.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>


@ -1,20 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2022
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cppIntegration", "cppIntegration_vs2022.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64
{997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@ -1,110 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup>
<CUDAPropsPath Condition="'$(CUDAPropsPath)'==''">$(VCTargetsPath)\BuildCustomizations</CUDAPropsPath>
</PropertyGroup>
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{997E0757-EA74-4A4E-A0FC-47D8C8831A15}</ProjectGuid>
<RootNamespace>cppIntegration_vs2022</RootNamespace>
<ProjectName>cppIntegration</ProjectName>
<CudaToolkitCustomDir />
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup>
<ConfigurationType>Application</ConfigurationType>
<CharacterSet>MultiByte</CharacterSet>
<PlatformToolset>v143</PlatformToolset>
<WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Debug'">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)'=='Release'">
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets">
<Import Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<IntDir>$(Platform)/$(Configuration)/</IntDir>
<IncludePath>$(IncludePath)</IncludePath>
<CodeAnalysisRuleSet>AllRules.ruleset</CodeAnalysisRuleSet>
<CodeAnalysisRules />
<CodeAnalysisRuleAssemblies />
</PropertyGroup>
<PropertyGroup Condition="'$(Platform)'=='x64'">
<OutDir>../../../bin/win64/$(Configuration)/</OutDir>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PreprocessorDefinitions>WIN32;_MBCS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>./;$(CudaToolkitDir)/include;../../../Common;</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalDependencies>cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(CudaToolkitLibDir);</AdditionalLibraryDirectories>
<OutputFile>$(OutDir)/cppIntegration.exe</OutputFile>
</Link>
<CudaCompile>
<CodeGeneration>compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86;</CodeGeneration>
<AdditionalOptions>-Xcompiler "/wd 4819" --threads 0 </AdditionalOptions>
<Include>./;../../../Common</Include>
<Defines>WIN32</Defines>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Debug'">
<ClCompile>
<Optimization>Disabled</Optimization>
<RuntimeLibrary>MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<LinkTimeCodeGeneration>Default</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MTd</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)'=='Release'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<RuntimeLibrary>MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>false</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
</Link>
<CudaCompile>
<Runtime>MT</Runtime>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
<CudaCompile Include="cppIntegration.cu" />
<ClCompile Include="cppIntegration_gold.cpp" />
<ClCompile Include="main.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(CUDAPropsPath)\CUDA 11.6.targets" />
</ImportGroup>
</Project>


@ -1,86 +0,0 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Example of integrating CUDA functions into an existing
* application / framework.
* CPP code representing the existing application / framework.
* Compiled with default CPP compiler.
*/
// includes, system
#include <iostream>
#include <stdlib.h>
// Required to include CUDA vector types
#include <cuda_runtime.h>
#include <vector_types.h>
#include <helper_cuda.h>
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
extern "C" bool runTest(const int argc, const char **argv, char *data,
int2 *data_int2, unsigned int len);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
// input data
int len = 16;
// the data is padded at the end with bytes of value 10 so that its size is a
// multiple of four; this lets each thread process four elements (which is
// necessary to avoid bank conflicts) without branching to guard against
// out-of-bounds reads, and the kernel's subtraction turns the padding into
// NUL terminators
char str[] = {82, 111, 118, 118, 121, 42, 97, 121,
124, 118, 110, 56, 10, 10, 10, 10};
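// (once the kernel subtracts 10 from every byte, this decodes to the ASCII
// string "Hello World.")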
// use int2 to show that CUDA vector types can be used in C++ code
int2 i2[16];
for (int i = 0; i < len; i++) {
i2[i].x = str[i];
i2[i].y = 10;
}
bool bTestResult;
// run the device part of the program
bTestResult = runTest(argc, (const char **)argv, str, i2, len);
std::cout << str << std::endl;
char str_device[16];
for (int i = 0; i < len; i++) {
str_device[i] = (char)(i2[i].x);
}
std::cout << str_device << std::endl;
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}


@ -1,340 +0,0 @@
################################################################################
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
################################################################################
#
# Makefile project only supported on Mac OS X and Linux platforms
#
################################################################################
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda
##############################
# start deprecated interface #
##############################
ifeq ($(x86_64),1)
$(info WARNING - x86_64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=x86_64 instead)
TARGET_ARCH ?= x86_64
endif
ifeq ($(ARMv7),1)
$(info WARNING - ARMv7 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=armv7l instead)
TARGET_ARCH ?= armv7l
endif
ifeq ($(aarch64),1)
$(info WARNING - aarch64 variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=aarch64 instead)
TARGET_ARCH ?= aarch64
endif
ifeq ($(ppc64le),1)
$(info WARNING - ppc64le variable has been deprecated)
$(info WARNING - please use TARGET_ARCH=ppc64le instead)
TARGET_ARCH ?= ppc64le
endif
ifneq ($(GCC),)
$(info WARNING - GCC variable has been deprecated)
$(info WARNING - please use HOST_COMPILER=$(GCC) instead)
HOST_COMPILER ?= $(GCC)
endif
ifneq ($(abi),)
$(error ERROR - abi variable has been removed)
endif
############################
# end deprecated interface #
############################
# architecture
HOST_ARCH := $(shell uname -m)
TARGET_ARCH ?= $(HOST_ARCH)
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le armv7l))
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 sbsa ppc64le))
TARGET_SIZE := 64
else ifneq (,$(filter $(TARGET_ARCH),armv7l))
TARGET_SIZE := 32
endif
else
TARGET_SIZE := $(shell getconf LONG_BIT)
endif
else
$(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
endif
# sbsa and aarch64 systems look similar. Need to differentiate them at host level for now.
ifeq ($(HOST_ARCH),aarch64)
ifeq ($(CUDA_PATH)/targets/sbsa-linux,$(shell ls -1d $(CUDA_PATH)/targets/sbsa-linux 2>/dev/null))
HOST_ARCH := sbsa
TARGET_ARCH := sbsa
endif
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-sbsa x86_64-ppc64le))
$(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
endif
endif
# On a native aarch64 system with a 32-bit userspace, change TARGET_ARCH to armv7l
ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
TARGET_ARCH = armv7l
endif
# operating system
HOST_OS := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
TARGET_OS ?= $(HOST_OS)
ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
$(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
endif
# host compiler
ifeq ($(TARGET_OS),darwin)
ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
HOST_COMPILER ?= clang++
endif
else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
ifeq ($(TARGET_OS),linux)
HOST_COMPILER ?= arm-linux-gnueabihf-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
else ifeq ($(TARGET_OS),android)
HOST_COMPILER ?= arm-linux-androideabi-g++
endif
else ifeq ($(TARGET_ARCH),aarch64)
ifeq ($(TARGET_OS), linux)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_OS),qnx)
ifeq ($(QNX_HOST),)
$(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
endif
ifeq ($(QNX_TARGET),)
$(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
endif
export QNX_HOST
export QNX_TARGET
HOST_COMPILER ?= $(QNX_HOST)/usr/bin/q++
else ifeq ($(TARGET_OS), android)
HOST_COMPILER ?= aarch64-linux-android-clang++
endif
else ifeq ($(TARGET_ARCH),sbsa)
HOST_COMPILER ?= aarch64-linux-gnu-g++
else ifeq ($(TARGET_ARCH),ppc64le)
HOST_COMPILER ?= powerpc64le-linux-gnu-g++
endif
endif
HOST_COMPILER ?= g++
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
# internal flags
NVCCFLAGS := -m${TARGET_SIZE}
CCFLAGS :=
LDFLAGS :=
# build flags
ifeq ($(TARGET_OS),darwin)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(HOST_ARCH)
else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
else ifeq ($(TARGET_OS),android)
LDFLAGS += -pie
CCFLAGS += -fpie -fpic -fexceptions
endif
ifneq ($(TARGET_ARCH),$(HOST_ARCH))
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/lib/aarch64-linux-gnu -L$(TARGET_FS)/lib/aarch64-linux-gnu
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L$(TARGET_FS)/usr/lib/aarch64-linux-gnu
LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
CCFLAGS += -isystem=$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include -I$(TARGET_FS)/usr/include/libdrm
CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu -I$(TARGET_FS)/usr/include/aarch64-linux-gnu
endif
endif
ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
NVCCFLAGS += -D_QNX_SOURCE
NVCCFLAGS += --qpp-config 8.3.0,gcc_ntoaarch64le
CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
LDFLAGS += -lsocket
LDFLAGS += -L/usr/lib/aarch64-qnx-gnu
CCFLAGS += "-Wl\,-rpath-link\,/usr/lib/aarch64-qnx-gnu"
ifdef TARGET_OVERRIDE
LDFLAGS += -lslog2
endif
ifneq ($(TARGET_FS),)
LDFLAGS += -L$(TARGET_FS)/usr/lib
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/lib"
LDFLAGS += -L$(TARGET_FS)/usr/libnvidia
CCFLAGS += "-Wl\,-rpath-link\,$(TARGET_FS)/usr/libnvidia"
CCFLAGS += -I$(TARGET_FS)/../include
endif
endif
endif
ifdef TARGET_OVERRIDE # cuda toolkit targets override
NVCCFLAGS += -target-dir $(TARGET_OVERRIDE)
endif
# Install directory of different arch
CUDA_INSTALL_TARGET_DIR :=
ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),sbsa-linux)
CUDA_INSTALL_TARGET_DIR = targets/sbsa-linux/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
else ifeq ($(TARGET_ARCH),ppc64le)
CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
BUILD_TYPE := debug
else
BUILD_TYPE := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
SAMPLE_ENABLED := 1
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I../../../Common
LIBRARIES :=
################################################################################
# Gencode arguments
ifeq ($(TARGET_ARCH),$(filter $(TARGET_ARCH),armv7l aarch64 sbsa))
SMS ?= 53 61 70 72 75 80 86 87
else
SMS ?= 35 37 50 52 60 61 70 75 80 86
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
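# Example (hypothetical SMS value): with SMS = "70 86" the loop above expands
# to -gencode arch=compute_70,code=sm_70 -gencode arch=compute_86,code=sm_86,
# plus -gencode arch=compute_86,code=compute_86 for the PTX forward-compat path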
ALL_CCFLAGS += --threads 0 --std=c++11
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
# Target rules
all: build
build: cppOverload
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
cppOverload.o:cppOverload.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
cppOverload: cppOverload.o
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
run: build
$(EXEC) ./cppOverload
testrun: build
clean:
rm -f cppOverload cppOverload.o
rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/cppOverload
clobber: clean

Some files were not shown because too many files have changed in this diff.