Compare commits

...

82 Commits

Author SHA1 Message Date
Rob Armstrong
8a9e2c830c
Update 1_Utilities/README.md to redirect bandwidthTest to NVBandwidth (#371) 2025-05-22 11:43:14 -07:00
Rob Armstrong
adacf1cffd
Merge pull request #368 from XSShawnZeng/master
Update the vulkan headers include sequence and the transpose code format check
2025-05-21 09:27:13 -07:00
shawnz
da3b7a2b3c Update the vulkanImageCUDA/vulkanImageCUDA.cu for Windows headers 2025-05-19 17:43:08 +08:00
shawnz
5987a9e9fa Update transpose for code format check 2025-05-19 17:38:42 +08:00
shawnz
107f3f537f Update the include files sequence for vulkan samples on Windows 2025-05-19 17:38:22 +08:00
Francesco Rizzi
b530f1cf42
Fix bug in 6_Performance/transpose: copy sharedmem kernel (#363)
Update kernel loop bounds handling and main-loop data copy to avoid incorrect reuse of output results.

---------

Authored-by: Francesco Rizzi <francesco.rizzi@ng-analytics.com>
2025-05-05 08:43:23 -07:00
Rob Armstrong
cab7c66b4f Update pre-config to include Python and JSON for EOL, whitespace checks 2025-05-01 10:17:42 -07:00
Rob Armstrong
8d400cfb7f Additional minor changes to run_tests.py output formatting 2025-05-01 10:14:09 -07:00
Rob Armstrong
6d6d964f97 Minor changes to run_tests.py output formatting 2025-05-01 09:54:25 -07:00
Rob Armstrong
ab68d58d59 Remove unused bin/x86_64 directory hierarchy 2025-05-01 09:53:54 -07:00
Rob Armstrong
c70d79cf3b Final 12.9 README updates 2025-05-01 09:39:06 -07:00
Rob Armstrong
14b1bfdcc4 Replace README references to "CUDA Toolkit 12.5" with general "CUDA Toolkit" 2025-04-30 09:46:45 -07:00
Rob Armstrong
c14a0114d6 Some samples require multiple GPUs. Update 'run_tests.py' to skip them on single- or no-GPU systems. 2025-04-30 09:45:20 -07:00
Rob Armstrong
ee15cc0fe2 Merge branch 'shawnz_bugs_fix' into 'master'
Bug fix for 5241914, 5164417 and 5097376

See merge request cuda-samples/cuda-samples!107
2025-04-28 08:53:11 -07:00
shawnz
3438fd4875 Update README for OpenMP 2025-04-28 23:44:45 +08:00
shawnz
b27b55ec70 Bug 5241914: Fix the error message for cuSolverDn_LinearSolver 2025-04-27 16:57:02 +08:00
shawnz
49159f3739 Bug 5164417 and 5097376: Fix the OpenMP finding issue for MSVC and Clang 2025-04-27 16:50:12 +08:00
Rob Armstrong
1680a1dc7f Update Windows FreeImage configuration instructions in README.md 2025-04-21 09:20:22 -07:00
Rob Armstrong
49daf0e4e0 Merge Bug 5199167: Fix the includes issue for 5_Domain_Specific\simpleD3D12
See merge request cuda-samples/cuda-samples!106
2025-04-21 08:11:52 -07:00
shawnz
a45fd3bd7c Bug 5199167: Fix the includes issue for 5_Domain_Specific\simpleD3D12 2025-04-21 11:52:33 +08:00
Rob Armstrong
0345908807 Update run_tests.py to enable multithreading 2025-04-07 08:48:44 -07:00
Rob Armstrong
3b9c8ce2e9 Merge branch 'shawnz_bugs_fix' into 'master'
Bug 5207005: Append pid in shmName for Linux only as this is for MIG scenario

See merge request cuda-samples/cuda-samples!100
2025-04-07 08:21:40 -07:00
shawnz
e77d6eb5ab Bug 5207005: Append pid in shmName for Linux only as this is for MIG scenario 2025-04-07 17:17:17 +08:00
Rob Armstrong
ac700327a2 Add folders to CMakeLists.txt for supporting generators and IDEs 2025-04-05 09:54:24 -07:00
Rob Armstrong
17703dd426 Merge branch 'shawnz_bugs_fix' into 'master'
Bug 5196977: Update includes for nbody

See merge request cuda-samples/cuda-samples!98
2025-04-03 01:16:20 -07:00
shawnz
a32d5badf7 Bug 5196977: Update includes for nbody 2025-04-03 15:30:05 +08:00
Rob Armstrong
1fd22429c3 Merge branch 'shawnz_bugs_fix' into 'master'
Changes to fix bugs 5196977, 4914019, 4191696 and 5199167.

See merge request cuda-samples/cuda-samples!97
2025-04-02 22:28:17 -07:00
Rob Armstrong
00ac0a1673 Remove bandwidthTest subdirectory from CMakeLists.txt 2025-04-02 22:27:30 -07:00
shawnz
b013387a39 Update code format 2025-04-03 11:23:26 +08:00
Rob Armstrong
9d921e0fe7 Add CONTRIBUTING.md 2025-04-02 11:29:16 -07:00
Rob Armstrong
7d1730f348 Remove outdated bandwidthTest sample 2025-04-02 11:19:48 -07:00
shawnz
718fe6486d Bug 5199167: Adjust the include header files sequence for simpleD3D11/simpleD3D11Texture 2025-04-02 15:10:29 +08:00
shawnz
ad9908e32b Bugs 4914019 & 4191696: Append pid in shmName for the MIG multiple-thread scenario 2025-04-02 11:20:09 +08:00
shawnz
952d6edf92 Bug 5196977: Include helper_gl.h before cuda_gl_interop.h 2025-04-01 16:07:32 +08:00
Rob Armstrong
685709bfc7 Merge branch 'shawnz_bugs_fix' into 'master'
Bug fixes for 5194249, 5188945 and 5164374

See merge request cuda-samples/cuda-samples!95
2025-03-31 08:00:50 -07:00
shawnz
0c92c34ca9 Bug 5164374: Remove the register keyword, which has been deprecated and removed from the C++17 standard 2025-03-31 15:13:56 +08:00
shawnz
0d82634f70 5188945: Add freeglut and glew64 .dll files for minsizeRel/RelWithDebInfo build 2025-03-31 15:07:29 +08:00
shawnz
4abbdf4e80 Bug 5194249: Need to include cuda_runtime.h for cudaNvSci after the clang format change 2025-03-31 14:57:31 +08:00
Rob Armstrong
914ca00f89 Small update to README.md to clarify test script usage. 2025-03-28 15:16:10 -07:00
Rob Armstrong
c8034f368a Add helper utility to test run all built samples (see README.md for usage details) 2025-03-28 15:07:07 -07:00
Rob Armstrong
ceab6e8bcc Apply consistent code formatting across the repo. Add clang-format and pre-commit hooks. 2025-03-27 10:30:07 -07:00
Rob Armstrong
2cd58fbc9a Update README version for 12.9 2025-03-26 10:24:22 -07:00
Rob Armstrong
c0ab53f986 Update all sample CMakeLists.txt to include ENABLE_CUDA_DEBUG flag to enable cuda-gdb 2025-03-26 10:08:59 -07:00
Rob Armstrong
b87c243bbb Add -lineinfo flag to all targets to include line information for developer tools 2025-03-26 09:44:20 -07:00
Rob Armstrong
e214cd29aa Update gencode arguments for separate kernel fatbin builds 2025-03-26 09:28:37 -07:00
Rob Armstrong
06d72496c2 Merge branch 'shawnz_tegra_crossbuild_toolchain' into 'master'
Bug 5133197: Add cmake toolchain and update the CMakeLists of some samples...

See merge request cuda-samples/cuda-samples!94
2025-03-25 14:52:02 -07:00
shawnz
2848d3bd21 Bug 5176886: Enable nvJPEG samples for aarch64 2025-03-21 13:02:14 +08:00
shawnz
bd0f630bf4 Bug 5133197: Add cmake toolchain and update the CMakeLists of some samples for Tegra Linux cross-build 2025-03-20 12:43:44 +08:00
shawnz
ab9166a6b2 Bug 5139353 and 5139213: Enhancement for streamOrderedAllocationIPC 2025-03-12 15:28:54 +08:00
Rob Armstrong
c90a1c6981 Merge public repo changes 2025-03-08 08:30:35 -08:00
Rob Armstrong
9370f11e69 graphConditionalNodes: Additional tweaks to launch dimension initialization (#348) 2025-03-05 18:18:37 -08:00
Rob Armstrong
291435e0b4
graphConditionalNodes: Additional tweaks to launch dimension initialization (#348) 2025-03-05 18:17:27 -08:00
Rob Armstrong
8d901e745d graphConditionalNodes: Change launch dimension initialization for better cross-platform compatibility (#346) 2025-03-05 08:33:35 -08:00
Rob Armstrong
990ebc01c2
graphConditionalNodes: Change launch dimension initialization for better cross-platform compatibility (#346) 2025-03-05 08:32:58 -08:00
Shawn Zeng
9adce9d9f2 Update file CMakeLists.txt 2025-03-03 19:19:50 -08:00
Rob Armstrong
bcad2c9e61 graphConditionalNodes: Add switch, while, if/else conditional examples and minor cleanup (#344) 2025-03-03 17:50:22 -08:00
Rob Armstrong
e7b23470d5
graphConditionalNodes: Add switch, while, if/else conditional examples and minor cleanup (#344) 2025-03-03 17:49:17 -08:00
Shawn Zeng
310e7f2a11 Bug 5143332: Remove the redundant content in 0_Introduction/CMakeLists.txt 2025-03-03 17:37:48 -08:00
Shawn Zeng
7f0f63f311 Bug 5034785: Update all non-ctx nppi APIs to ctx APIs as per latest change on NPP 2025-02-27 03:01:47 -08:00
Shawn Zeng
acd3a015c8 Revert "Bug 5034785: Update all non-ctx nppi APIs to ctx APIs as per latest change on NPP"
This reverts commit a9869fd6eaeecc748fc5f10f4b331fa41efbdaca
2025-02-27 02:48:03 -08:00
shawnz
a9869fd6ea Bug 5034785: Update all non-ctx nppi APIs to ctx APIs as per latest change on NPP 2025-02-27 18:43:53 +08:00
XSShawnZeng
3e8f91d1a1
Several small bug fixes for Windows platforms
* Enhancement for GLFW include and lib search

* Fixing issue #321: A potential bug in memMapIPCDrv/memMapIpc.cpp

* Update CMakeLists.txt for the sample 0_Introduction/template

* Copy .dll to correct dir for 5_Domain_Specific/Mandelbrot

* Fix typo

* Update changelog for cudaNvSciBufMultiplanar
2025-02-26 08:23:39 -08:00
Jonathan Bentz
f3b7c41ad6
cudaNvSci: Update README.md fixing typo (#337)
Fixes #193
2025-02-21 09:21:43 -08:00
Jonathan Bentz
29fb758e62
conjugateGradient: Ensure allocated memory is freed (#336)
Fixes #202
2025-02-21 09:20:53 -08:00
Jonathan Bentz
3bc08136ff
Update README.md link for sortingNetworks (#335)
Fixes #302
2025-02-21 09:19:21 -08:00
Jonathan Bentz
85eefa06c4
boxFilter: Remove unused parameter (#338)
Fixes: #122
2025-02-21 09:17:45 -08:00
XSShawnZeng
c357dd1e6b
Fixing issue #321: A potential bug in memMapIPCDrv/memMapIpc.cpp (#334) 2025-02-21 09:14:25 -08:00
Jonathan Bentz
efb46383e0
Transpose: Change TILE_DIM to 32 to fix bank conflicts
Fixes #175
2025-02-20 15:46:44 -08:00
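
For context on this fix: tiled shared-memory transpose kernels avoid bank conflicts by matching the tile width to the GPU's 32 shared-memory banks and padding the tile's inner dimension. The sketch below illustrates that general idiom under the assumption of 32x32 thread blocks; it shows the technique, not the literal patch in this commit.

```cpp
// Generic bank-conflict-avoidance idiom for a tiled transpose (illustrative).
// TILE_DIM matches the 32 shared-memory banks; the +1 padding column staggers
// rows across banks so column-wise reads in the write phase do not collide.
#define TILE_DIM 32

__global__ void transposeTiled(float *out, const float *in, int width, int height)
{
    __shared__ float tile[TILE_DIM][TILE_DIM + 1]; // +1 avoids bank conflicts

    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    if (x < width && y < height)
        tile[threadIdx.y][threadIdx.x] = in[y * width + x];

    __syncthreads();

    // Swap block indices so the output write stays coalesced.
    x = blockIdx.y * TILE_DIM + threadIdx.x;
    y = blockIdx.x * TILE_DIM + threadIdx.y;
    if (x < height && y < width)
        out[y * height + x] = tile[threadIdx.x][threadIdx.y];
}
```
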
XSShawnZeng
8d564d5e3a
Enhancement for GLFW include and lib search (#331)
Fixes NVIDIA bug 5115098
2025-02-20 08:06:40 -08:00
Jake Hemstad
37c5bcbef4 Update kernels.cuh 2025-02-19 17:33:10 -08:00
Rob Armstrong
940a4c7a91
memMapIpc: Resolve build-time warnings and minor potential issues (#329)
* Fix compute performance calculation type casting in gpuGetMaxGflopsDeviceIdDRV() for #109

* 3_CUDA_Features/memMapIPCDrv: Increase procIdx buffer size to prevent potential buffer overflow

* memMapIPCDrv: Fix memory leaks and improve header inclusion

- Remove redundant string.h header
- Add memory cleanup for dynamically allocated JIT options and log buffer
- Fix printf format specifier for unsigned long long
2025-02-19 15:52:20 -08:00
ohmaya
61bd39800d
simplePrintf.cu: "Compute capability" text (#299)
Compute %d.%d capability => Compute capability %d.%d
2025-02-19 15:22:34 -08:00
Rob Armstrong
8a96d2eee7
Fix compute performance calculation type casting in gpuGetMaxGflopsDeviceIdDRV() for #109 2025-02-19 10:43:18 -08:00
Rob Armstrong
e762d58260
Merge pull request #247 from sangeetsatheesh/master
Fix typo from Open issue #161
2025-02-18 17:22:48 -08:00
Rob Armstrong
8fd1701744
Merge branch 'master' into master 2025-02-18 17:22:04 -08:00
Rob Armstrong
94765c1597
Fix minor typo in README.md (#326) 2025-02-18 17:14:14 -08:00
Rob Armstrong
c87881f02c
Update matrix multiplication sample README references (#325)
- Clarify reference to Shared Memory section in CUDA programming guide
- Update cuBLAS interface version description
- Add hyperlink to Shared Memory documentation
2025-02-18 14:02:59 -08:00
Rob Armstrong
25400b6b3c
Merge pull request #287 from steffen-v/patch-1
fix "gridy" comandline argument for initMC
2025-02-18 13:30:27 -08:00
Rob Armstrong
e24f62e28c
Fix README.md version number typo
Fix inadvertent reference to prior release in README.md
2025-02-15 13:37:51 -08:00
steffen-v
22424227e7
fix "gridy" comandline argument for initMC 2024-07-26 14:42:05 +02:00
Sangeet S
42ff742bf5
Merge pull request #1 from sangeetsatheesh/sangeetsatheesh-fix-typo
Fix typo #161
2024-01-17 13:16:53 -05:00
Sangeet S
8ccb13c6f0
Fix typo #161
Fix typo in line 14 from "simple exemple" to "simple example"
2024-01-17 13:16:01 -05:00
1375 changed files with 108912 additions and 258785 deletions

49
.clang-format Normal file
View File

@@ -0,0 +1,49 @@
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: Consecutive
AlignConsecutiveDeclarations: Consecutive
AlignConsecutiveMacros: Consecutive
AlignEscapedNewlines: Left
AlignOperands: AlignAfterOperator
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
AfterClass: true
AfterControlStatement: false
AfterExternBlock: true
AfterFunction: true
AfterStruct: true
AfterUnion: true
BeforeCatch: true
BeforeElse: true
IndentBraces: false
BreakBeforeBraces: Custom
BreakBeforeConceptDeclarations: true
BreakBeforeBinaryOperators: NonAssignment
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
ColumnLimit: 120
DerivePointerAlignment: false
FixNamespaceComments: true
IncludeCategories:
- Regex: '^<.*>'
Priority: 1
- Regex: '^".*"'
Priority: 2
SortIncludes: true
IncludeBlocks: Regroup
IndentWidth: 4
MaxEmptyLinesToKeep: 2
PointerAlignment: Right
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
Standard: c++17
TabWidth: 4
UseTab: Never
...
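
As a quick illustration of what these rules imply, here is a small hypothetical C++ fragment laid out in the spirit of this configuration (4-space indents, braces on their own line after structs and functions, consecutive declarations and assignments aligned, pointers right-aligned). The exact output of clang-format may differ in detail.

```cpp
// Hypothetical fragment; names are invented for illustration only.
struct DeviceInfo
{
    int         smCount  = 0;       // AlignConsecutiveDeclarations/Assignments:
    int         clockKHz = 0;       // types, names, and '=' line up in columns
    const char *name     = nullptr; // PointerAlignment: Right
};

static bool isFaster(const DeviceInfo &a, const DeviceInfo &b)
{
    // BreakBeforeBinaryOperators: NonAssignment - when a line exceeds the
    // 120-column limit, the break lands before the operator, not after it.
    return static_cast<long long>(a.smCount) * a.clockKHz
         > static_cast<long long>(b.smCount) * b.clockKHz;
}
```
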

3
.gitignore vendored
View File

@@ -1,3 +1,6 @@
build
.vs
.clangd
test
settings.json
launch.json

106
.pre-commit-config.yaml Normal file
View File

@@ -0,0 +1,106 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
ci:
autofix_commit_msg: |
[pre-commit.ci] auto code formatting
autofix_prs: false
autoupdate_branch: ''
autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
autoupdate_schedule: quarterly
skip: []
submodules: false
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: end-of-file-fixer
exclude: |
(?x)^(
.*\.raw$|
.*\.bin$|
.*\.dat$|
.*\.nv12$|
data/.*|
Common/.*
)
files: |
(?x)^(
.*\.txt$|
.*\.md$|
.*\.cpp$|
.*\.cxx$|
.*\.hpp$|
.*\.h$|
.*\.cu$|
.*\.cuh$|
.*\.py$|
.*\.json$
)
- id: mixed-line-ending
exclude: |
(?x)^(
.*\.raw$|
.*\.bin$|
.*\.dat$|
.*\.nv12$|
data/.*|
Common/.*
)
files: |
(?x)^(
.*\.txt$|
.*\.md$|
.*\.cpp$|
.*\.cxx$|
.*\.hpp$|
.*\.h$|
.*\.cu$|
.*\.cuh$|
.*\.py$|
.*\.json$
)
- id: trailing-whitespace
exclude: |
(?x)^(
.*\.raw$|
.*\.bin$|
.*\.dat$|
.*\.nv12$|
data/.*|
Common/.*
)
files: |
(?x)^(
.*\.txt$|
.*\.md$|
.*\.cpp$|
.*\.cxx$|
.*\.hpp$|
.*\.h$|
.*\.cu$|
.*\.cuh$|
.*\.py$|
.*\.json$
)
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v19.1.6
hooks:
- id: clang-format
types_or: [file]
files: |
(?x)^(
^.*\.c$|
^.*\.cpp$|
^.*\.cu$|
^.*\.cuh$|
^.*\.cxx$|
^.*\.h$|
^.*\.hpp$|
^.*\.inl$|
^.*\.mm$
)
exclude: |
(?x)^(
Common/.*
)
args: ["-fallback-style=none", "-style=file", "-i"]

View File

@@ -1,5 +1,15 @@
## Changelog
### CUDA 12.9
* Updated toolchain for cross-compilation for Tegra Linux platforms.
* Added `run_tests.py` utility to exercise all samples. See README.md for details
* Repository has been updated with consistent code formatting across all samples
* Many small code tweaks and bug fixes (see commit history for details)
* Removed the following outdated samples:
* `1_Utilities`
* `bandwidthTest` - this sample was out of date and did not produce accurate results. For bandwidth
testing of NVIDIA GPU platforms, please refer to [NVBandwidth](https://github.com/NVIDIA/nvbandwidth)
### CUDA 12.8
* Updated build system across the repository to CMake. Removed Visual Studio project files and Makefiles.
* Removed the following outdated samples:
@@ -36,6 +46,7 @@
* `cuDLALayerwiseStatsHybrid`
* `cuDLALayerwiseStatsStandalone`
* `cuDLAStandaloneMode`
* `cudaNvSciBufMultiplanar`
* `cudaNvSciNvMedia`
* `fluidsGLES`
* `nbody_opengles`

View File

@@ -16,8 +16,10 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")

103
CONTRIBUTING.md Normal file
View File

@@ -0,0 +1,103 @@
# Contributing to the CUDA Samples
Thank you for your interest in contributing to the CUDA Samples!
## Getting Started
1. **Fork & Clone the Repository**:
Fork the repository and clone the fork. For more information, check [GitHub's documentation on forking](https://docs.github.com/en/github/getting-started-with-github/fork-a-repo) and [cloning a repository](https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/cloning-a-repository).
## Making Changes
1. **Create a New Branch**:
```bash
git checkout -b your-feature-branch
```
2. **Make Changes**.
3. **Build and Test**:
Ensure changes don't break existing functionality by building and running tests.
For more details on building and testing, refer to the [Building and Testing](#building-and-testing) section below.
4. **Commit Changes**:
```bash
git commit -m "Brief description of the change"
```
## Building and Testing
For information on building and running tests on the samples, please refer to the main [README](README.md)
## Creating a Pull Request
1. Push changes to your fork
2. Create a pull request targeting the `master` branch of the original CUDA Samples repository. Refer to [GitHub's documentation](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) for more information on creating a pull request.
3. Describe the purpose and context of the changes in the pull request description.
## Code Formatting (pre-commit hooks)
The CUDA Samples repository uses [pre-commit](https://pre-commit.com/) to execute all code linters and formatters. These
tools ensure a consistent coding style throughout the project. Using pre-commit ensures that linter
versions and options are aligned for all developers. Additionally, there is a CI check in place to
enforce that committed code follows our standards.
The linters used by the CUDA Samples are listed in `.pre-commit-config.yaml`.
For example, C++ and CUDA code is formatted with [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html).
To use `pre-commit`, install via `conda` or `pip`:
```bash
conda config --add channels conda-forge
conda install pre-commit
```
```bash
pip install pre-commit
```
Then run pre-commit hooks before committing code:
```bash
pre-commit run
```
By default, pre-commit runs on staged files (only changes and additions that will be committed).
To run pre-commit checks on all files, execute:
```bash
pre-commit run --all-files
```
Optionally, you may set up the pre-commit hooks to run automatically when you make a git commit. This can be done by running:
```bash
pre-commit install
```
Now code linters and formatters will be run each time you commit changes.
You can skip these checks with `git commit --no-verify` or with the short version `git commit -n`, although please note
that this may result in pull requests being rejected if subsequent checks fail.
## Review Process
Once submitted, maintainers will be automatically assigned to review the pull request. They might suggest changes or improvements. Constructive feedback is a part of the collaborative process, aimed at ensuring the highest quality code.
For constructive feedback and effective communication during reviews, we recommend following [Conventional Comments](https://conventionalcomments.org/).
Further recommended reading for successful PR reviews:
- [How to Do Code Reviews Like a Human (Part One)](https://mtlynch.io/human-code-reviews-1/)
- [How to Do Code Reviews Like a Human (Part Two)](https://mtlynch.io/human-code-reviews-2/)
## Thank You
Your contributions enhance the CUDA Samples for the entire community. We appreciate your effort and collaboration!

View File

@@ -241,7 +241,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
}
unsigned long long compute_perf =
(unsigned long long)(multiProcessorCount * sm_per_multiproc *
((unsigned long long)multiProcessorCount * sm_per_multiproc *
clockRate);
if (compute_perf > max_compute_perf) {
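
The subtlety behind this change: in the old line, the three `int` factors are multiplied in 32-bit arithmetic first, and only the already-overflowed result is cast to 64 bits. Casting the first operand instead promotes the whole product to `unsigned long long` before any multiplication happens. A minimal standalone sketch with made-up values:

```cpp
#include <cstdio>

int main()
{
    int multiProcessorCount = 132;     // hypothetical SM count
    int sm_per_multiproc    = 128;     // hypothetical cores per SM
    int clockRate           = 1980000; // hypothetical clock in kHz

    // Old pattern: int * int * int overflows 32 bits *before* the cast.
    // (Signed overflow is undefined behavior; shown only to illustrate the bug.)
    unsigned long long wrong = (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);

    // Fixed pattern: cast one operand first so the multiply runs in 64 bits.
    unsigned long long right = (unsigned long long)multiProcessorCount * sm_per_multiproc * clockRate;

    printf("overflowed: %llu\ncorrect:    %llu\n", wrong, right);
    return 0;
}
```
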

View File

@@ -258,7 +258,7 @@ namespace nv
s[2] = &r3[0];
s[3] = &r4[0];
register int i,j,p,jj;
int i,j,p,jj;
for (i=0; i<4; i++)
{

189
README.md
View File

@@ -1,6 +1,6 @@
# CUDA Samples
Samples for CUDA Developers which demonstrate features in CUDA Toolkit. This version supports [CUDA Toolkit 12.6](https://developer.nvidia.com/cuda-downloads).
Samples for CUDA Developers which demonstrate features in CUDA Toolkit. This version supports [CUDA Toolkit 12.9](https://developer.nvidia.com/cuda-downloads).
## Release Notes
@@ -14,7 +14,7 @@ This section describes the release notes for the CUDA Samples on GitHub only.
### Prerequisites
Download and install the [CUDA Toolkit 12.8](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
For system requirements and installation instructions of the CUDA Toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/) and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html).
### Getting the CUDA Samples
@@ -72,6 +72,17 @@ Open the generated solution file CUDA_Samples.sln in Visual Studio. Build the sa
Run the samples from the output directories specified in Visual Studio.
### Enabling On-GPU Debugging
NVIDIA GPUs support on-GPU debugging through cuda-gdb. Enabling this may significantly affect application performance, as certain compiler optimizations are disabled
in this configuration, so it is not on by default. On-device debugging is enabled via the `-G` switch to nvcc.
To enable cuda-gdb for samples builds, define the `ENABLE_CUDA_DEBUG` flag on the CMake command line. For example:
```
cmake -DENABLE_CUDA_DEBUG=True ...
```
### Platform-Specific Samples
Some CUDA samples are specific to certain platforms, and require passing flags into CMake to enable. In particular, we define the following platform-specific flags:
@@ -94,9 +105,9 @@ Navigate to the root of the cloned repository and create a build directory:
```
mkdir build && cd build
```
Configure the project with CMake, specifying the Tegra toolchain file:
Configure the project with CMake, specifying the Tegra toolchain file. You can use `-DTARGET_FS` to point to the target file system root path for the necessary include and library files:
```
cmake .. -DCMAKE_TOOLCHAIN_FILE=/path/to/tegra/toolchain.cmake
cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/toolchain-aarch64-linux.cmake -DTARGET_FS=/path/to/target/system/file/system
```
Build the samples:
```
@@ -111,7 +122,7 @@ Instead of being in the default location, `/usr/local/cuda/include` or `/usr/loc
`/usr/local/cuda/<ARCH>/targets/aarch64-linux/lib`
and
`/usr/local/cuda-12.8/<ARCH>/include`
`/usr/local/cuda/<ARCH>/include`
An example build might look like this:
@@ -128,6 +139,168 @@ Note that in the current branch sample cross-compilation for QNX is not fully va
near future with QNX cross-compilation instructions. In the meantime, if you want to cross-compile for QNX please check out one
of the previous tags prior to the CMake build system transition in 12.8.
## Running All Samples as Tests
It's important to note that the CUDA samples are _not_ intended as a validation suite for CUDA. They do not cover corner cases, they do not completely cover the
runtime and driver APIs, and they are not intended for performance benchmarking. That said, it can sometimes be useful to run all of the samples as a quick sanity
check, and we provide a script to do so, `run_tests.py`.
This Python3 script finds all executables in a subdirectory you choose, matching application names with command line arguments specified in `test_args.json`. It accepts
the following command line arguments:
| Switch | Purpose | Example |
| ---------- | -------------------------------------------------------------------------------------------------------------- | ----------------------- |
| --dir | Specify the root directory to search for executables (recursively) | --dir ./build/Samples |
| --config | JSON configuration file for executable arguments | --config test_args.json |
| --output | Output directory for test results (stdout saved to .txt files - directory will be created if it doesn't exist) | --output ./test |
| --args | Global arguments to pass to all executables (not currently used) | --args arg_1 arg_2 ... |
| --parallel | Number of applications to execute in parallel. | --parallel 8 |
Application configurations are loaded from `test_args.json` and matched against executable names (discarding the `.exe` extension on Windows).
The script returns 0 on success, or the first non-zero error code encountered during testing on failure. It will also print a condensed list of samples that failed, if any.
There are three primary modes of configuration:
**Skip**
An executable configured with "skip" will not be executed. These generally rely on having attached graphical displays and are not suited to this kind of automation.
Configuration example:
```json
"fluidsGL": {
"skip": true
}
```
You will see:
```
Skipping fluidsGL (marked as skip in config)
```
**Single Run**
For executables to run one time only with arguments, specify each argument as a list entry. Each entry in the JSON file will be appended to the command line, separated
by a space.
All applications execute from their current directory, so all paths are relative to the application's location.
Note that if an application needs no arguments, this entry is optional. An executable found without a matching entry in the JSON will just run as `./application` from its
current directory.
Configuration example:
```json
"ptxgen": {
"args": [
"test.ll",
"-arch=compute_75"
]
}
```
You will see:
```
Running ptxgen
Command: ./ptxgen test.ll -arch=compute_75
Test completed with return code 0
```
**Multiple Runs**
For executables to run multiple times with different command line arguments, specify any number of sets of args within a "runs" list.
As with single runs, all applications execute from their current directory, so all paths are relative to the application's location.
Configuration example:
```json
"recursiveGaussian": {
"runs": [
{
"args": [
"-sigma=10",
"-file=data/ref_10.ppm"
]
},
{
"args": [
"-sigma=14",
"-file=data/ref_14.ppm"
]
},
{
"args": [
"-sigma=18",
"-file=data/ref_18.ppm"
]
},
{
"args": [
"-sigma=22",
"-file=data/ref_22.ppm"
]
}
]
}
```
You will see:
```
Running recursiveGaussian (run 1/4)
Command: ./recursiveGaussian -sigma=10 -file=data/ref_10.ppm
Test completed with return code 0
Running recursiveGaussian (run 2/4)
Command: ./recursiveGaussian -sigma=14 -file=data/ref_14.ppm
Test completed with return code 0
Running recursiveGaussian (run 3/4)
Command: ./recursiveGaussian -sigma=18 -file=data/ref_18.ppm
Test completed with return code 0
Running recursiveGaussian (run 4/4)
Command: ./recursiveGaussian -sigma=22 -file=data/ref_22.ppm
Test completed with return code 0
```
### Example Usage
Here is an example set of commands to build and test all of the samples.
First, build:
```bash
mkdir build
cd build
cmake ..
make -j$(nproc)
```
Now, return to the samples root directory and run the test script:
```bash
cd ..
python3 run_tests.py --output ./test --dir ./build/Samples --config test_args.json
```
If all applications run successfully, you will see something similar to this (the specific number of samples will depend on your build type
and system configuration):
```
Test Summary:
Ran 199 test runs for 180 executables.
All test runs passed!
```
If some samples fail, you will see something like this:
```
Test Summary:
Ran 199 test runs for 180 executables.
Failed runs (2):
bicubicTexture (run 1/5): Failed (code 1)
Mandelbrot (run 1/2): Failed (code 1)
```
You can inspect the stdout logs in the output directory (generally `APM_<application_name>.txt` or `APM_<application_name>.run<n>.txt`) to help
determine what may have gone wrong. Please file issues against the samples repository if you believe a sample is failing
incorrectly on your system.
## Samples list
### [0. Introduction](./Samples/0_Introduction/README.md)
@@ -170,7 +343,7 @@ These third-party dependencies are required by some CUDA samples. If available,
FreeImage is an open source imaging library. FreeImage can usually be installed on Linux using your distribution's package manager system. FreeImage can also be downloaded from the FreeImage website.
To set up FreeImage on a Windows system, extract the FreeImage DLL distribution into the folder `../../../Common/FreeImage/Dist/x64` such that it contains the .h and .lib files. Copy the .dll file to the Release/ Debug/ execution folder or pass the FreeImage folder when cmake configuring with the `-DFREEIMAGE_INCLUDE_DIR` and `-DFREEIMAGE_LIBRARY` options.
To set up FreeImage on a Windows system, extract the FreeImage DLL distribution into the folder `./Common/FreeImage/Dist/x64` such that it contains the .h and .lib files. Copy the .dll file to the Release/ or Debug/ execution folder, or pass the FreeImage folder when configuring with CMake via the `-DFreeImage_INCLUDE_DIR` and `-DFreeImage_LIBRARY` options.
#### Message Passing Interface
@@ -203,11 +376,11 @@ Vulkan is a low-overhead, cross-platform 3D graphics and compute API. Vulkan tar
#### GLFW
GLFW is a lightweight, open-source library designed for managing OpenGL, OpenGL ES, and Vulkan contexts. It simplifies the process of creating and managing windows, handling user input (keyboard, mouse, and joystick), and working with multiple monitors in a cross-platform manner.
To set up GLFW on a Windows system, Download the pre-built binaries from [GLFW website](https://www.glfw.org/download.html) and extract the zip file into the folder, pass the GLFW include header as `-DGLFW_INCLUDE_DIR` for cmake configuring and follow the Build_instructions.txt in the sample folder to set up the t.
To set up GLFW on a Windows system, download the pre-built binaries from the [GLFW website](https://www.glfw.org/download.html), extract the zip file, and pass the GLFW include folder as `-DGLFW_INCLUDE_DIR` and the lib folder as `-DGLFW_LIB_DIR` when configuring with CMake.
#### OpenMP
OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/).
OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/). For compilers such as clang, `libomp.so` and other LLVM components must be installed separately. You will also need to set additional flags in your CMake configuration files, such as: `-DOpenMP_CXX_FLAGS="-fopenmp=libomp" -DOpenMP_CXX_LIB_NAMES="omp" -DOpenMP_omp_LIBRARY="/path/to/libomp.so"`.
#### Screen

View File

@@ -1,20 +1,3 @@
cmake_minimum_required(VERSION 3.20)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
project(simpleCallback LANGUAGES C CXX CUDA)
find_package(CUDAToolkit REQUIRED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
endif()
add_subdirectory(UnifiedMemoryStreams)
add_subdirectory(asyncAPI)
add_subdirectory(clock)
@@ -55,6 +38,7 @@ add_subdirectory(simpleTexture3D)
add_subdirectory(simpleTextureDrv)
add_subdirectory(simpleVoteIntrinsics)
add_subdirectory(simpleZeroCopy)
add_subdirectory(template)
add_subdirectory(systemWideAtomics)
add_subdirectory(vectorAdd)
add_subdirectory(vectorAddDrv)

View File

@@ -10,15 +10,21 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries
include_directories(../../../Common)
# Source file
find_package(OpenMP REQUIRED)
if(CMAKE_GENERATOR MATCHES "Visual Studio")
find_package(OpenMP REQUIRED C CXX)
else()
find_package(OpenMP REQUIRED)
endif()
if(${OpenMP_FOUND})
# Add target for UnifiedMemoryStreams

View File

@@ -28,7 +28,7 @@ cudaStreamDestroy, cudaFree, cudaMallocManaged, cudaStreamAttachMemAsync, cudaSe
## Prerequisites
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details)

View File

@@ -31,10 +31,10 @@
*/
// system includes
#include <algorithm>
#include <cstdio>
#include <ctime>
#include <vector>
#include <algorithm>
#ifdef USE_PTHREADS
#include <pthread.h>
#else
@@ -51,291 +51,287 @@
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// SRAND48 and DRAND48 don't exist on windows, but these are the equivalent
// functions
void srand48(long seed) { srand((unsigned int)seed); }
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif
const char *sSDKname = "UnifiedMemoryStreams";
// simple task
template <typename T>
struct Task {
unsigned int size, id;
T *data;
T *result;
T *vector;
template <typename T> struct Task
{
unsigned int size, id;
T *data;
T *result;
T *vector;
Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL){};
Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL) {
// allocate unified memory -- the operation performed in this example will
// be a DGEMV
checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
checkCudaErrors(cudaDeviceSynchronize());
}
~Task() {
// ensure all memory is deallocated
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaFree(data));
checkCudaErrors(cudaFree(result));
checkCudaErrors(cudaFree(vector));
}
void allocate(const unsigned int s, const unsigned int unique_id) {
// allocate unified memory outside of constructor
id = unique_id;
size = s;
checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
checkCudaErrors(cudaDeviceSynchronize());
// populate data with random elements
for (unsigned int i = 0; i < size * size; i++) {
data[i] = drand48();
Task()
: size(0)
, id(0)
, data(NULL)
, result(NULL)
, vector(NULL) {};
Task(unsigned int s)
: size(s)
, id(0)
, data(NULL)
, result(NULL)
{
// allocate unified memory -- the operation performed in this example will
// be a DGEMV
checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
checkCudaErrors(cudaDeviceSynchronize());
}
for (unsigned int i = 0; i < size; i++) {
result[i] = 0.;
vector[i] = drand48();
~Task()
{
// ensure all memory is deallocated
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaFree(data));
checkCudaErrors(cudaFree(result));
checkCudaErrors(cudaFree(vector));
}
void allocate(const unsigned int s, const unsigned int unique_id)
{
// allocate unified memory outside of constructor
id = unique_id;
size = s;
checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
checkCudaErrors(cudaDeviceSynchronize());
// populate data with random elements
for (unsigned int i = 0; i < size * size; i++) {
data[i] = drand48();
}
for (unsigned int i = 0; i < size; i++) {
result[i] = 0.;
vector[i] = drand48();
}
}
}
};
#ifdef USE_PTHREADS
struct threadData_t {
int tid;
Task<double> *TaskListPtr;
cudaStream_t *streams;
cublasHandle_t *handles;
int taskSize;
struct threadData_t
{
int tid;
Task<double> *TaskListPtr;
cudaStream_t *streams;
cublasHandle_t *handles;
int taskSize;
};
typedef struct threadData_t threadData;
#endif
// simple host dgemv: assume data is in row-major format and square
template <typename T>
void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) {
// rows
for (int i = 0; i < n; i++) {
result[i] *= beta;
template <typename T> void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result)
{
// rows
for (int i = 0; i < n; i++) {
result[i] *= beta;
for (int j = 0; j < n; j++) {
result[i] += A[i * n + j] * x[j];
for (int j = 0; j < n; j++) {
result[i] += A[i * n + j] * x[j];
}
}
}
}
// execute a single task on either host or device depending on size
#ifdef USE_PTHREADS
void *execute(void *inpArgs) {
threadData *dataPtr = (threadData *)inpArgs;
cudaStream_t *stream = dataPtr->streams;
cublasHandle_t *handle = dataPtr->handles;
int tid = dataPtr->tid;
void *execute(void *inpArgs)
{
threadData *dataPtr = (threadData *)inpArgs;
cudaStream_t *stream = dataPtr->streams;
cublasHandle_t *handle = dataPtr->handles;
int tid = dataPtr->tid;
for (int i = 0; i < dataPtr->taskSize; i++) {
Task<double> &t = dataPtr->TaskListPtr[i];
for (int i = 0; i < dataPtr->taskSize; i++) {
Task<double> &t = dataPtr->TaskListPtr[i];
if (t.size < 100) {
// perform on host
printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
t.size);
if (t.size < 100) {
// perform on host
printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
// attach managed memory to a (dummy) stream to allow host access while
// the device is running
checkCudaErrors(
cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
checkCudaErrors(
cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
checkCudaErrors(
cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished
checkCudaErrors(cudaStreamSynchronize(stream[0]));
// call the host operation
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
} else {
// perform on device
printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
t.size);
double one = 1.0;
double zero = 0.0;
// attach managed memory to a (dummy) stream to allow host access while
// the device is running
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished
checkCudaErrors(cudaStreamSynchronize(stream[0]));
// call the host operation
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
}
else {
// perform on device
printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
double one = 1.0;
double zero = 0.0;
// attach managed memory to my stream
checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
cudaMemAttachSingle));
// call the device operation
checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
&one, t.data, t.size, t.vector, 1, &zero,
t.result, 1));
// attach managed memory to my stream
checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
// call the device operation
checkCudaErrors(cublasDgemv(
handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
}
}
}
pthread_exit(NULL);
pthread_exit(NULL);
}
#else
template <typename T>
void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream,
int tid) {
if (t.size < 100) {
// perform on host
printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
t.size);
template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
if (t.size < 100) {
// perform on host
printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
// attach managed memory to a (dummy) stream to allow host access while the
// device is running
checkCudaErrors(
cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
checkCudaErrors(
cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
checkCudaErrors(
cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished
checkCudaErrors(cudaStreamSynchronize(stream[0]));
// call the host operation
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
} else {
// perform on device
printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
t.size);
double one = 1.0;
double zero = 0.0;
// attach managed memory to a (dummy) stream to allow host access while the
// device is running
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished
checkCudaErrors(cudaStreamSynchronize(stream[0]));
// call the host operation
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
}
else {
// perform on device
printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
double one = 1.0;
double zero = 0.0;
// attach managed memory to my stream
checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
cudaMemAttachSingle));
// call the device operation
checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
&one, t.data, t.size, t.vector, 1, &zero,
t.result, 1));
}
// attach managed memory to my stream
checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
// call the device operation
checkCudaErrors(cublasDgemv(
handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
}
}
#endif
// populate a list of tasks with random sizes
template <typename T>
void initialise_tasks(std::vector<Task<T> > &TaskList) {
for (unsigned int i = 0; i < TaskList.size(); i++) {
// generate random size
int size;
size = std::max((int)(drand48() * 1000.0), 64);
TaskList[i].allocate(size, i);
}
template <typename T> void initialise_tasks(std::vector<Task<T>> &TaskList)
{
for (unsigned int i = 0; i < TaskList.size(); i++) {
// generate random size
int size;
size = std::max((int)(drand48() * 1000.0), 64);
TaskList[i].allocate(size, i);
}
}
int main(int argc, char **argv) {
// set device
cudaDeviceProp device_prop;
int dev_id = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
int main(int argc, char **argv)
{
// set device
cudaDeviceProp device_prop;
int dev_id = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
if (!device_prop.managedMemory) {
// This samples requires being run on a device that supports Unified Memory
fprintf(stderr, "Unified Memory not supported on this device\n");
if (!device_prop.managedMemory) {
// This samples requires being run on a device that supports Unified Memory
fprintf(stderr, "Unified Memory not supported on this device\n");
exit(EXIT_WAIVED);
}
exit(EXIT_WAIVED);
}
if (device_prop.computeMode == cudaComputeModeProhibited) {
// This sample requires being run with a default or process exclusive mode
fprintf(stderr,
"This sample requires a device in either default or process "
"exclusive mode\n");
if (device_prop.computeMode == cudaComputeModeProhibited) {
// This sample requires being run with a default or process exclusive mode
fprintf(stderr,
"This sample requires a device in either default or process "
"exclusive mode\n");
exit(EXIT_WAIVED);
}
exit(EXIT_WAIVED);
}
// randomise task sizes
int seed = (int)time(NULL);
srand48(seed);
// randomise task sizes
int seed = (int)time(NULL);
srand48(seed);
// set number of threads
const int nthreads = 4;
// set number of threads
const int nthreads = 4;
// number of streams = number of threads
cudaStream_t *streams = new cudaStream_t[nthreads + 1];
cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];
// number of streams = number of threads
cudaStream_t *streams = new cudaStream_t[nthreads + 1];
cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];
for (int i = 0; i < nthreads + 1; i++) {
checkCudaErrors(cudaStreamCreate(&streams[i]));
checkCudaErrors(cublasCreate(&handles[i]));
}
for (int i = 0; i < nthreads + 1; i++) {
checkCudaErrors(cudaStreamCreate(&streams[i]));
checkCudaErrors(cublasCreate(&handles[i]));
}
// create list of N tasks
unsigned int N = 40;
std::vector<Task<double> > TaskList(N);
initialise_tasks(TaskList);
// create list of N tasks
unsigned int N = 40;
std::vector<Task<double>> TaskList(N);
initialise_tasks(TaskList);
printf("Executing tasks on host / device\n");
printf("Executing tasks on host / device\n");
// run through all tasks using threads and streams
#ifdef USE_PTHREADS
pthread_t threads[nthreads];
threadData *InputToThreads = new threadData[nthreads];
pthread_t threads[nthreads];
threadData *InputToThreads = new threadData[nthreads];
for (int i = 0; i < nthreads; i++) {
checkCudaErrors(cudaSetDevice(dev_id));
InputToThreads[i].tid = i;
InputToThreads[i].streams = streams;
InputToThreads[i].handles = handles;
for (int i = 0; i < nthreads; i++) {
checkCudaErrors(cudaSetDevice(dev_id));
InputToThreads[i].tid = i;
InputToThreads[i].streams = streams;
InputToThreads[i].handles = handles;
if ((TaskList.size() / nthreads) == 0) {
InputToThreads[i].taskSize = (TaskList.size() / nthreads);
InputToThreads[i].TaskListPtr =
&TaskList[i * (TaskList.size() / nthreads)];
} else {
if (i == nthreads - 1) {
InputToThreads[i].taskSize =
(TaskList.size() / nthreads) + (TaskList.size() % nthreads);
InputToThreads[i].TaskListPtr =
&TaskList[i * (TaskList.size() / nthreads) +
(TaskList.size() % nthreads)];
} else {
InputToThreads[i].taskSize = (TaskList.size() / nthreads);
InputToThreads[i].TaskListPtr =
&TaskList[i * (TaskList.size() / nthreads)];
}
if ((TaskList.size() / nthreads) == 0) {
InputToThreads[i].taskSize = (TaskList.size() / nthreads);
InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
}
else {
if (i == nthreads - 1) {
InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
InputToThreads[i].TaskListPtr =
&TaskList[i * (TaskList.size() / nthreads) + (TaskList.size() % nthreads)];
}
else {
InputToThreads[i].taskSize = (TaskList.size() / nthreads);
InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
}
}
pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
}
for (int i = 0; i < nthreads; i++) {
pthread_join(threads[i], NULL);
}
pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
}
for (int i = 0; i < nthreads; i++) {
pthread_join(threads[i], NULL);
}
#else
omp_set_num_threads(nthreads);
omp_set_num_threads(nthreads);
#pragma omp parallel for schedule(dynamic)
for (int i = 0; i < TaskList.size(); i++) {
checkCudaErrors(cudaSetDevice(dev_id));
int tid = omp_get_thread_num();
execute(TaskList[i], handles, streams, tid);
}
for (int i = 0; i < TaskList.size(); i++) {
checkCudaErrors(cudaSetDevice(dev_id));
int tid = omp_get_thread_num();
execute(TaskList[i], handles, streams, tid);
}
#endif
cudaDeviceSynchronize();
cudaDeviceSynchronize();
// Destroy CUDA Streams, cuBlas handles
for (int i = 0; i < nthreads + 1; i++) {
cudaStreamDestroy(streams[i]);
cublasDestroy(handles[i]);
}
// Destroy CUDA Streams, cuBlas handles
for (int i = 0; i < nthreads + 1; i++) {
cudaStreamDestroy(streams[i]);
cublasDestroy(handles[i]);
}
// Free TaskList
std::vector<Task<double> >().swap(TaskList);
// Free TaskList
std::vector<Task<double>>().swap(TaskList);
printf("All Done!\n");
exit(EXIT_SUCCESS);
printf("All Done!\n");
exit(EXIT_SUCCESS);
}

View File

@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@@ -27,6 +27,6 @@ cudaProfilerStop, cudaMalloc, cudaMemcpyAsync, cudaFree, cudaMallocHost, cudaPro
## Prerequisites
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details)

View File

@@ -38,105 +38,107 @@
#include <stdio.h>
// includes CUDA Runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper utility functions
#include <helper_functions.h> // helper utility functions
__global__ void increment_kernel(int *g_data, int inc_value) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
g_data[idx] = g_data[idx] + inc_value;
__global__ void increment_kernel(int *g_data, int inc_value)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
g_data[idx] = g_data[idx] + inc_value;
}
bool correct_output(int *data, const int n, const int x) {
for (int i = 0; i < n; i++)
if (data[i] != x) {
printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
return false;
bool correct_output(int *data, const int n, const int x)
{
for (int i = 0; i < n; i++)
if (data[i] != x) {
printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
return false;
}
return true;
}
int main(int argc, char *argv[])
{
int devID;
cudaDeviceProp deviceProps;
printf("[%s] - Starting...\n", argv[0]);
// This will pick the best possible CUDA capable device
devID = findCudaDevice(argc, (const char **)argv);
// get device name
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s]\n", deviceProps.name);
int n = 16 * 1024 * 1024;
int nbytes = n * sizeof(int);
int value = 26;
// allocate host memory
int *a = 0;
checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
memset(a, 0, nbytes);
// allocate device memory
int *d_a = 0;
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
checkCudaErrors(cudaMemset(d_a, 255, nbytes));
// set kernel launch configuration
dim3 threads = dim3(512, 1);
dim3 blocks = dim3(n / threads.x, 1);
// create cuda event handles
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkResetTimer(&timer);
checkCudaErrors(cudaDeviceSynchronize());
float gpu_time = 0.0f;
// asynchronously issue work to the GPU (all to stream 0)
checkCudaErrors(cudaProfilerStart());
sdkStartTimer(&timer);
cudaEventRecord(start, 0);
cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
cudaEventRecord(stop, 0);
sdkStopTimer(&timer);
checkCudaErrors(cudaProfilerStop());
// have CPU do some work while waiting for stage 1 to finish
unsigned long int counter = 0;
while (cudaEventQuery(stop) == cudaErrorNotReady) {
counter++;
}
return true;
}
int main(int argc, char *argv[]) {
int devID;
cudaDeviceProp deviceProps;
printf("[%s] - Starting...\n", argv[0]);
// This will pick the best possible CUDA capable device
devID = findCudaDevice(argc, (const char **)argv);
// get device name
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s]\n", deviceProps.name);
int n = 16 * 1024 * 1024;
int nbytes = n * sizeof(int);
int value = 26;
// allocate host memory
int *a = 0;
checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
memset(a, 0, nbytes);
// allocate device memory
int *d_a = 0;
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
checkCudaErrors(cudaMemset(d_a, 255, nbytes));
// set kernel launch configuration
dim3 threads = dim3(512, 1);
dim3 blocks = dim3(n / threads.x, 1);
// create cuda event handles
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkResetTimer(&timer);
checkCudaErrors(cudaDeviceSynchronize());
float gpu_time = 0.0f;
// asynchronously issue work to the GPU (all to stream 0)
checkCudaErrors(cudaProfilerStart());
sdkStartTimer(&timer);
cudaEventRecord(start, 0);
cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
cudaEventRecord(stop, 0);
sdkStopTimer(&timer);
checkCudaErrors(cudaProfilerStop());
// have CPU do some work while waiting for stage 1 to finish
unsigned long int counter = 0;
while (cudaEventQuery(stop) == cudaErrorNotReady) {
counter++;
}
checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
// print the cpu and gpu times
printf("time spent executing by the GPU: %.2f\n", gpu_time);
printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
printf("CPU executed %lu iterations while waiting for GPU to finish\n",
counter);
// check the output for correctness
bool bFinalResults = correct_output(a, n, value);
// release resources
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaFreeHost(a));
checkCudaErrors(cudaFree(d_a));
exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
// print the cpu and gpu times
printf("time spent executing by the GPU: %.2f\n", gpu_time);
printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
// check the output for correctness
bool bFinalResults = correct_output(a, n, value);
// release resources
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaFreeHost(a));
checkCudaErrors(cudaFree(d_a));
exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}

View File

@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@ -27,6 +27,6 @@ cudaMalloc, cudaMemcpy, cudaFree
## Prerequisites
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details)

View File

@ -48,43 +48,46 @@
// This kernel computes a standard parallel reduction and evaluates the
// time it takes to do that for each block. The timing results are stored
// in device memory.
__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
{
    // __shared__ float shared[2 * blockDim.x];
    extern __shared__ float shared[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    if (tid == 0)
        timer[bid] = clock();
    // Copy input.
    shared[tid] = input[tid];
    shared[tid + blockDim.x] = input[tid + blockDim.x];
    // Perform reduction to find minimum.
    for (int d = blockDim.x; d > 0; d /= 2) {
        __syncthreads();
        if (tid < d) {
            float f0 = shared[tid];
            float f1 = shared[tid + d];
            if (f1 < f0) {
                shared[tid] = f1;
            }
        }
    }
    // Write result.
    if (tid == 0)
        output[bid] = shared[0];
    __syncthreads();
    if (tid == 0)
        timer[bid + gridDim.x] = clock();
}
#define NUM_BLOCKS 64
#define NUM_THREADS 256
// It's interesting to change the number of blocks and the number of threads to
@ -104,50 +107,46 @@ __global__ static void timedReduction(const float *input, float *output,
// the memory. With more than 32 the speed scales linearly.
// Start the main CUDA Sample here
int main(int argc, char **argv)
{
    printf("CUDA Clock sample\n");
    // This will pick the best possible CUDA capable device
    int dev = findCudaDevice(argc, (const char **)argv);
    float *dinput = NULL;
    float *doutput = NULL;
    clock_t *dtimer = NULL;
    clock_t timer[NUM_BLOCKS * 2];
    float input[NUM_THREADS * 2];
    for (int i = 0; i < NUM_THREADS * 2; i++) {
        input[i] = (float)i;
    }
    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
    checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaFree(dinput));
    checkCudaErrors(cudaFree(doutput));
    checkCudaErrors(cudaFree(dtimer));
    long double avgElapsedClocks = 0;
    for (int i = 0; i < NUM_BLOCKS; i++) {
        avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
    }
    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
    printf("Average clocks/block = %Lf\n", avgElapsedClocks);
    return EXIT_SUCCESS;
}
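The sample prints raw SM clock ticks. To express the average as wall-clock time, the tick count can be divided by the SM clock rate; a minimal sketch, assuming the clock stays fixed during the run (cudaDeviceProp::clockRate is reported in kHz, i.e. ticks per millisecond):

cudaDeviceProp prop;
checkCudaErrors(cudaGetDeviceProperties(&prop, dev));
// ticks / (ticks per millisecond) gives an approximate duration in ms
double avgElapsedMs = (double)avgElapsedClocks / prop.clockRate;
printf("Average time/block = %f ms\n", avgElapsedMs);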

View File

@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@ -33,7 +33,7 @@ cudaBlockSize, cudaGridSize
## Prerequisites
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details)

View File

@ -34,12 +34,11 @@
*/
// System includes
#include <assert.h>
#include <cuda_runtime.h>
#include <nvrtc_helper.h>
#include <stdint.h>
#include <stdio.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
@ -71,64 +70,68 @@
// Start the main CUDA Sample here
int main(int argc, char **argv)
{
    printf("CUDA Clock sample\n");
    typedef long clock_t;
    clock_t timer[NUM_BLOCKS * 2];
    float input[NUM_THREADS * 2];
    for (int i = 0; i < NUM_THREADS * 2; i++) {
        input[i] = (float)i;
    }
    char *cubin, *kernel_file;
    size_t cubinSize;
    kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
    CUmodule module = loadCUBIN(cubin, argc, argv);
    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));
    dim3 cudaBlockSize(NUM_THREADS, 1, 1);
    dim3 cudaGridSize(NUM_BLOCKS, 1, 1);
    CUdeviceptr dinput, doutput, dtimer;
    checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
    checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
    checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
    checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));
    void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   cudaGridSize.x,
                                   cudaGridSize.y,
                                   cudaGridSize.z, /* grid dim */
                                   cudaBlockSize.x,
                                   cudaBlockSize.y,
                                   cudaBlockSize.z, /* block dim */
                                   sizeof(float) * 2 * NUM_THREADS,
                                   0, /* shared mem, stream */
                                   &arr[0], /* arguments */
                                   0));
    checkCudaErrors(cuCtxSynchronize());
    checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
    checkCudaErrors(cuMemFree(dinput));
    checkCudaErrors(cuMemFree(doutput));
    checkCudaErrors(cuMemFree(dtimer));
    long double avgElapsedClocks = 0;
    for (int i = 0; i < NUM_BLOCKS; i++) {
        avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
    }
    avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
    printf("Average clocks/block = %Lf\n", avgElapsedClocks);
    return EXIT_SUCCESS;
}
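compileFileToCUBIN and loadCUBIN are helpers from nvrtc_helper.h; under the hood they drive NVRTC and the driver API. A rough sketch of that underlying flow with documented calls only (error checking omitted; kernelSource is assumed to hold the .cu text already):

nvrtcProgram prog;
nvrtcCreateProgram(&prog, kernelSource, "clock_kernel.cu", 0, NULL, NULL);
nvrtcCompileProgram(prog, 0, NULL);     // compile with default options
size_t ptxSize;
nvrtcGetPTXSize(prog, &ptxSize);
char *ptx = (char *)malloc(ptxSize);
nvrtcGetPTX(prog, ptx);                 // PTX is the portable alternative to a cubin
nvrtcDestroyProgram(&prog);
CUmodule module;
cuModuleLoadData(&module, ptx);         // JIT-load into the current context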

View File

@ -37,38 +37,41 @@
// time it takes to do that for each block. The timing results are stored
// in device memory.
extern "C" __global__ void timedReduction(const float *input, float *output,
clock_t *timer) {
// __shared__ float shared[2 * blockDim.x];
extern __shared__ float shared[];
extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer)
{
// __shared__ float shared[2 * blockDim.x];
extern __shared__ float shared[];
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int tid = threadIdx.x;
const int bid = blockIdx.x;
if (tid == 0) timer[bid] = clock();
if (tid == 0)
timer[bid] = clock();
// Copy input.
shared[tid] = input[tid];
shared[tid + blockDim.x] = input[tid + blockDim.x];
// Copy input.
shared[tid] = input[tid];
shared[tid + blockDim.x] = input[tid + blockDim.x];
// Perform reduction to find minimum.
for (int d = blockDim.x; d > 0; d /= 2) {
__syncthreads();
if (tid < d) {
float f0 = shared[tid];
float f1 = shared[tid + d];
if (f1 < f0) {
shared[tid] = f1;
}
}
}
// Write result.
if (tid == 0)
output[bid] = shared[0];
// Perform reduction to find minimum.
for (int d = blockDim.x; d > 0; d /= 2) {
__syncthreads();
if (tid < d) {
float f0 = shared[tid];
float f1 = shared[tid + d];
if (f1 < f0) {
shared[tid] = f1;
}
}
}
// Write result.
if (tid == 0) output[bid] = shared[0];
__syncthreads();
if (tid == 0) timer[bid + gridDim.x] = clock();
if (tid == 0)
timer[bid + gridDim.x] = clock();
}

View File

@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@ -30,7 +30,7 @@ cudaMemcpy, cudaGetErrorString, cudaFree, cudaGetLastError, cudaSetDevice, cudaG
## Prerequisites
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details)

View File

@ -32,128 +32,125 @@
#include <helper_cuda.h>
#include <omp.h>
#include <stdio.h> // stdio functions are used since C++ streams aren't necessarily thread safe
using namespace std;
// a simple kernel that simply increments each array element by b
__global__ void kernelAddConstant(int *g_a, const int b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    g_a[idx] += b;
}
// a predicate that checks whether each array element is set to its index plus b
int correctResult(int *data, const int n, const int b)
{
    for (int i = 0; i < n; i++)
        if (data[i] != i + b)
            return 0;
    return 1;
}
int main(int argc, char *argv[])
{
    int num_gpus = 0; // number of CUDA GPUs
    printf("%s Starting...\n\n", argv[0]);
    /////////////////////////////////////////////////////////////////
    // determine the number of CUDA capable GPUs
    //
    cudaGetDeviceCount(&num_gpus);
    if (num_gpus < 1) {
        printf("no CUDA capable devices were detected\n");
        return 1;
    }
    /////////////////////////////////////////////////////////////////
    // display CPU and GPU configuration
    //
    printf("number of host CPUs:\t%d\n", omp_get_num_procs());
    printf("number of CUDA devices:\t%d\n", num_gpus);
    for (int i = 0; i < num_gpus; i++) {
        cudaDeviceProp dprop;
        cudaGetDeviceProperties(&dprop, i);
        printf("   %d: %s\n", i, dprop.name);
    }
    printf("---------------------------\n");
    /////////////////////////////////////////////////////////////////
    // initialize data
    //
    unsigned int n = num_gpus * 8192;
    unsigned int nbytes = n * sizeof(int);
    int *a = 0; // pointer to data on the CPU
    int b = 3;  // value by which the array is incremented
    a = (int *)malloc(nbytes);
    if (0 == a) {
        printf("couldn't allocate CPU memory\n");
        return 1;
    }
    for (unsigned int i = 0; i < n; i++)
        a[i] = i;
    ////////////////////////////////////////////////////////////////
    // run as many CPU threads as there are CUDA devices
    //   each CPU thread controls a different device, processing its
    //   portion of the data. It's possible to use more CPU threads
    //   than there are CUDA devices, in which case several CPU
    //   threads will be allocating resources and launching kernels
    //   on the same device. For example, try omp_set_num_threads(2*num_gpus);
    //   Recall that all variables declared inside an "omp parallel" scope are
    //   local to each CPU thread
    //
    omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
    // omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there
    // are CUDA devices
#pragma omp parallel
    {
        unsigned int cpu_thread_id = omp_get_thread_num();
        unsigned int num_cpu_threads = omp_get_num_threads();
        // set and check the CUDA device for this CPU thread
        int gpu_id = -1;
        checkCudaErrors(
            cudaSetDevice(cpu_thread_id % num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
        checkCudaErrors(cudaGetDevice(&gpu_id));
        printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);
        int *d_a = 0; // pointer to memory on the device associated with this CPU thread
        int *sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data
        unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
        dim3 gpu_threads(128); // 128 threads per block
        dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));
        checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
        checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
        checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
        kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
        checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
        checkCudaErrors(cudaFree(d_a));
    }
    printf("---------------------------\n");
    if (cudaSuccess != cudaGetLastError())
        printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    ////////////////////////////////////////////////////////////////
    // check the result
    //
    bool bResult = correctResult(a, n, b);
    if (a)
        free(a); // free CPU memory
    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
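Since the host code relies on OpenMP, the host compiler needs its OpenMP flag when nvcc builds this sample; typical invocations (illustrative, not the exact lines this repo's CMake generates) are:

nvcc -Xcompiler -fopenmp cudaOpenMP.cu -o cudaOpenMP -lgomp     # GCC/Linux
nvcc -Xcompiler /openmp cudaOpenMP.cu -o cudaOpenMP             # MSVC/Windows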

View File

@ -9,8 +9,10 @@ find_package(CUDAToolkit REQUIRED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 60 61 70 72 75 80 86 87 89 90 100 101 120)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@ -30,7 +30,7 @@ cudaMemcpy, cudaFree, cudaMallocHost, cudaFreeHost, cudaMalloc, cudaGetDevicePro
## Prerequisites
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details)

View File

@ -25,191 +25,188 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cuda_fp16.h"
#include "helper_cuda.h"
#include <cstdio>
#include <cstdlib>
#include <ctime>
#define NUM_OF_BLOCKS 128
#include "cuda_fp16.h"
#include "helper_cuda.h"
#define NUM_OF_BLOCKS 128
#define NUM_OF_THREADS 128
__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) {
if (threadIdx.x < 64)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
__syncthreads();
if (threadIdx.x < 32)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 32]);
__syncthreads();
if (threadIdx.x < 16)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 16]);
__syncthreads();
if (threadIdx.x < 8)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 8]);
__syncthreads();
if (threadIdx.x < 4)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 4]);
__syncthreads();
if (threadIdx.x < 2)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 2]);
__syncthreads();
if (threadIdx.x < 1)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 1]);
__syncthreads();
__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v)
{
if (threadIdx.x < 64)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
__syncthreads();
if (threadIdx.x < 32)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 32]);
__syncthreads();
if (threadIdx.x < 16)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 16]);
__syncthreads();
if (threadIdx.x < 8)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 8]);
__syncthreads();
if (threadIdx.x < 4)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 4]);
__syncthreads();
if (threadIdx.x < 2)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 2]);
__syncthreads();
if (threadIdx.x < 1)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 1]);
__syncthreads();
}
__forceinline__ __device__ void reduceInShared_native(half2 *const v) {
if (threadIdx.x < 64) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
__syncthreads();
if (threadIdx.x < 32) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
__syncthreads();
if (threadIdx.x < 16) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
__syncthreads();
if (threadIdx.x < 8) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
__syncthreads();
if (threadIdx.x < 4) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
__syncthreads();
if (threadIdx.x < 2) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
__syncthreads();
if (threadIdx.x < 1) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
__syncthreads();
__forceinline__ __device__ void reduceInShared_native(half2 *const v)
{
if (threadIdx.x < 64)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
__syncthreads();
if (threadIdx.x < 32)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
__syncthreads();
if (threadIdx.x < 16)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
__syncthreads();
if (threadIdx.x < 8)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
__syncthreads();
if (threadIdx.x < 4)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
__syncthreads();
if (threadIdx.x < 2)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
__syncthreads();
if (threadIdx.x < 1)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
__syncthreads();
}
__global__ void scalarProductKernel_intrinsics(half2 const *const a,
half2 const *const b,
float *const results,
size_t const size) {
const int stride = gridDim.x * blockDim.x;
__shared__ half2 shArray[NUM_OF_THREADS];
__global__ void
scalarProductKernel_intrinsics(half2 const *const a, half2 const *const b, float *const results, size_t const size)
{
const int stride = gridDim.x * blockDim.x;
__shared__ half2 shArray[NUM_OF_THREADS];
shArray[threadIdx.x] = __float2half2_rn(0.f);
half2 value = __float2half2_rn(0.f);
shArray[threadIdx.x] = __float2half2_rn(0.f);
half2 value = __float2half2_rn(0.f);
for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
value = __hfma2(a[i], b[i], value);
}
for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
value = __hfma2(a[i], b[i], value);
}
shArray[threadIdx.x] = value;
__syncthreads();
reduceInShared_intrinsics(shArray);
shArray[threadIdx.x] = value;
__syncthreads();
reduceInShared_intrinsics(shArray);
if (threadIdx.x == 0) {
half2 result = shArray[0];
float f_result = __low2float(result) + __high2float(result);
results[blockIdx.x] = f_result;
}
if (threadIdx.x == 0) {
half2 result = shArray[0];
float f_result = __low2float(result) + __high2float(result);
results[blockIdx.x] = f_result;
}
}
__global__ void scalarProductKernel_native(half2 const *const a,
half2 const *const b,
float *const results,
size_t const size) {
const int stride = gridDim.x * blockDim.x;
__shared__ half2 shArray[NUM_OF_THREADS];
__global__ void
scalarProductKernel_native(half2 const *const a, half2 const *const b, float *const results, size_t const size)
{
const int stride = gridDim.x * blockDim.x;
__shared__ half2 shArray[NUM_OF_THREADS];
half2 value(0.f, 0.f);
shArray[threadIdx.x] = value;
half2 value(0.f, 0.f);
shArray[threadIdx.x] = value;
for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
value = a[i] * b[i] + value;
}
for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
value = a[i] * b[i] + value;
}
shArray[threadIdx.x] = value;
__syncthreads();
reduceInShared_native(shArray);
shArray[threadIdx.x] = value;
__syncthreads();
reduceInShared_native(shArray);
if (threadIdx.x == 0) {
half2 result = shArray[0];
float f_result = (float)result.y + (float)result.x;
results[blockIdx.x] = f_result;
}
if (threadIdx.x == 0) {
half2 result = shArray[0];
float f_result = (float)result.y + (float)result.x;
results[blockIdx.x] = f_result;
}
}
void generateInput(half2 *a, size_t size) {
for (size_t i = 0; i < size; ++i) {
half2 temp;
temp.x = static_cast<float>(rand() % 4);
temp.y = static_cast<float>(rand() % 2);
a[i] = temp;
}
void generateInput(half2 *a, size_t size)
{
for (size_t i = 0; i < size; ++i) {
half2 temp;
temp.x = static_cast<float>(rand() % 4);
temp.y = static_cast<float>(rand() % 2);
a[i] = temp;
}
}
int main(int argc, char *argv[]) {
srand((unsigned int)time(NULL));
size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;
int main(int argc, char *argv[])
{
srand((unsigned int)time(NULL));
size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;
half2 *vec[2];
half2 *devVec[2];
half2 *vec[2];
half2 *devVec[2];
float *results;
float *devResults;
float *results;
float *devResults;
int devID = findCudaDevice(argc, (const char **)argv);
int devID = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp devProp;
checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));
cudaDeviceProp devProp;
checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));
if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
printf(
"ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
"higher.\n");
return EXIT_WAIVED;
}
if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
printf("ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
"higher.\n");
return EXIT_WAIVED;
}
for (int i = 0; i < 2; ++i) {
checkCudaErrors(cudaMallocHost((void **)&vec[i], size * sizeof *vec[i]));
checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
}
for (int i = 0; i < 2; ++i) {
checkCudaErrors(cudaMallocHost((void **)&vec[i], size * sizeof *vec[i]));
checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
}
checkCudaErrors(
cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
checkCudaErrors(
cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));
checkCudaErrors(cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
checkCudaErrors(cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));
for (int i = 0; i < 2; ++i) {
generateInput(vec[i], size);
checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i],
cudaMemcpyHostToDevice));
}
for (int i = 0; i < 2; ++i) {
generateInput(vec[i], size);
checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i], cudaMemcpyHostToDevice));
}
scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
devVec[0], devVec[1], devResults, size);
scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);
checkCudaErrors(cudaMemcpy(results, devResults,
NUM_OF_BLOCKS * sizeof *results,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));
float result_native = 0;
for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
result_native += results[i];
}
printf("Result native operators\t: %f \n", result_native);
float result_native = 0;
for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
result_native += results[i];
}
printf("Result native operators\t: %f \n", result_native);
scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
devVec[0], devVec[1], devResults, size);
scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);
checkCudaErrors(cudaMemcpy(results, devResults,
NUM_OF_BLOCKS * sizeof *results,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));
float result_intrinsics = 0;
for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
result_intrinsics += results[i];
}
printf("Result intrinsics\t: %f \n", result_intrinsics);
float result_intrinsics = 0;
for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
result_intrinsics += results[i];
}
printf("Result intrinsics\t: %f \n", result_intrinsics);
printf("&&&& fp16ScalarProduct %s\n",
(fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED"
: "FAILED");
printf("&&&& fp16ScalarProduct %s\n", (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED" : "FAILED");
for (int i = 0; i < 2; ++i) {
checkCudaErrors(cudaFree(devVec[i]));
checkCudaErrors(cudaFreeHost(vec[i]));
}
for (int i = 0; i < 2; ++i) {
checkCudaErrors(cudaFree(devVec[i]));
checkCudaErrors(cudaFreeHost(vec[i]));
}
checkCudaErrors(cudaFree(devResults));
checkCudaErrors(cudaFreeHost(results));
checkCudaErrors(cudaFree(devResults));
checkCudaErrors(cudaFreeHost(results));
return EXIT_SUCCESS;
return EXIT_SUCCESS;
}
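Two details of the kernels above are worth spelling out. Each half2 packs two fp16 values, which is why the per-block result folds the .x and .y halves together at the end; on the host the same packing can be done with the __floats2half2_rn intrinsic from cuda_fp16.h. Also, the loops start at threadIdx.x + blockDim.x + blockIdx.x, whereas a conventional grid-stride loop starts at the global thread index; for comparison, a textbook version would read (a sketch only, not the sample's code):

half2 h = __floats2half2_rn(1.0f, 2.0f);   // pack two floats into one half2
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
    value = __hfma2(a[i], b[i], value);    // fused multiply-add on both packed halves at once
}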

View File

@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@ -2,7 +2,7 @@
## Description
This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance performance for matrix multiplication.
This sample implements matrix multiplication and is exactly the same as the second example of the [Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) section of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the CUDA 4.0+ interface for cuBLAS to demonstrate high-performance matrix multiplication.
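For reference, the cuBLAS path mentioned above reduces to a single SGEMM call. A minimal sketch, assuming device buffers d_A (MxK), d_B (KxN), and d_C (MxN) in row-major order already exist; cuBLAS is column-major, so row-major C = A*B is obtained by swapping the operands:

cublasHandle_t handle;
cublasCreate(&handle);
const float alpha = 1.0f, beta = 0.0f;
// column-major view: C^T = B^T * A^T, hence B comes first
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B, N, d_A, K, &beta, d_C, N);
cublasDestroy(handle);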
## Key Concepts
@ -27,6 +27,6 @@ cudaStreamCreateWithFlags, cudaProfilerStop, cudaMalloc, cudaFree, cudaMallocHos
## Prerequisites
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details)

View File

@ -40,314 +40,303 @@
*/
// System includes
#include <assert.h>
#include <stdio.h>
// CUDA runtime
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
// Helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>
/**
* Matrix multiplication (CUDA Kernel) on the device: C = A * B
* wA is A's width and wB is B's width
*/
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;
    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;
    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + wA - 1;
    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;
    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;
    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * wB;
    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;
    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];
        // Synchronize to make sure the matrices are loaded
        __syncthreads();
        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx];
        }
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }
    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}
void ConstantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}
/**
* Run a simple test of matrix multiplication using CUDA
*/
int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB)
{
    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A;
    checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B;
    checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
    cudaStream_t stream;
    // Initialize host memory
    const float valB = 0.01f;
    ConstantInit(h_A, size_A, 1.0f);
    ConstantInit(h_B, size_B, valB);
    // Allocate device memory
    float *d_A, *d_B, *d_C;
    // Allocate host matrix C
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float *h_C;
    checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));
    if (h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    // copy host memory to device
    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
    // Setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");
    // Performs warmup operation using matrixMul CUDA kernel
    if (block_size == 16) {
        MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    else {
        MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    printf("done\n");
    checkCudaErrors(cudaStreamSynchronize(stream));
    // Record the start event
    checkCudaErrors(cudaEventRecord(start, stream));
    // Execute the kernel
    int nIter = 300;
    for (int j = 0; j < nIter; j++) {
        if (block_size == 16) {
            MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
        else {
            MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    }
    // Record the stop event
    checkCudaErrors(cudaEventRecord(stop, stream));
    // Wait for the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));
    float msecTotal = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul =
        2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
           " WorkgroupSize= %u threads/block\n",
           gigaFlops,
           msecPerMatrixMul,
           flopsPerMatrixMul,
           threads.x * threads.y);
    // Copy result from device to host
    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));
    printf("Checking computed result for correctness: ");
    bool correct = true;
    // test relative error by the formula
    //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
    double eps = 1.e-6; // machine zero
    for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
        double abs_err = fabs(h_C[i] - (dimsA.x * valB));
        double dot_length = dimsA.x;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err / abs_val / dot_length;
        if (rel_err > eps) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
            correct = false;
        }
    }
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
    // Clean up memory
    checkCudaErrors(cudaFreeHost(h_A));
    checkCudaErrors(cudaFreeHost(h_B));
    checkCudaErrors(cudaFreeHost(h_C));
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    printf("\nNOTE: The CUDA Samples are not meant for performance "
           "measurements. Results may vary when GPU Boost is enabled.\n");
    if (correct) {
        return EXIT_SUCCESS;
    }
    else {
        return EXIT_FAILURE;
    }
}
/**
* Program main
*/
int main(int argc, char **argv)
{
    printf("[Matrix Multiply Using CUDA] - Starting...\n");
    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
        printf("  Note: Outer matrix dimensions of A & B matrices"
               " must be equal.\n");
        exit(EXIT_SUCCESS);
    }
    // This will pick the best possible CUDA capable device, otherwise
    // override the device ID based on input provided at the command line
    int dev = findCudaDevice(argc, (const char **)argv);
    int block_size = 32;
    dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
    dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
    // width of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
    }
    // height of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
    }
    // width of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
    }
    // height of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
    }
    if (dimsA.x != dimsB.y) {
        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }
    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
    checkCudaErrors(cudaProfilerStart());
    int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
    checkCudaErrors(cudaProfilerStop());
    exit(matrix_result);
}

View File

@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries
@ -38,6 +40,12 @@ target_link_libraries(matrixMulDrv PUBLIC
set(CUDA_FATBIN_FILE "${CMAKE_CURRENT_BINARY_DIR}/matrixMul_kernel64.fatbin")
set(CUDA_KERNEL_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/matrixMul_kernel.cu")
# Construct GENCODE_FLAGS explicitly from CUDA architectures
set(GENCODE_FLAGS "")
foreach(arch ${CMAKE_CUDA_ARCHITECTURES})
list(APPEND GENCODE_FLAGS "-gencode=arch=compute_${arch},code=sm_${arch}")
endforeach()
add_custom_command(
OUTPUT ${CUDA_FATBIN_FILE}
COMMAND ${CMAKE_CUDA_COMPILER} ${INCLUDES} ${ALL_CCFLAGS} -Wno-deprecated-gpu-targets ${GENCODE_FLAGS} -o ${CUDA_FATBIN_FILE} -fatbin ${CUDA_KERNEL_SOURCE}
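For each architecture in the list, the foreach loop above appends one flag, so e.g. the entry 90 would contribute (illustrative):

-gencode=arch=compute_90,code=sm_90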

View File

@ -27,6 +27,6 @@ cuMemcpyDtoH, cuLaunchKernel, cuMemcpyHtoD, cuDeviceGetName, cuDeviceTotalMem, c
## Prerequisites
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details)

View File

@ -30,11 +30,11 @@
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA               // Matrix B height
#define WC WB               // Matrix C width
#define HC HA               // Matrix C height
#endif // _MATRIXMUL_H_

View File

@ -46,23 +46,23 @@
// includes, system
#include <builtin_types.h>
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes, project, CUDA
#include <cuda.h>
#include <helper_cuda_drvapi.h>
#include <helper_image.h>
#include <helper_string.h>
#include <helper_timer.h>
#include <cstring>
#include <iostream>
#include <string>
#include "matrixMul.h"
@ -71,11 +71,9 @@
void runTest(int argc, char **argv);
void randomInit(float *, int);
extern "C" void computeGold(float *, const float *, const float *, unsigned int,
unsigned int, unsigned int);
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
int *blk_size);
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size);
#ifndef FATBIN_FILE
#define FATBIN_FILE "matrixMul_kernel64.fatbin"
@ -84,237 +82,252 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
////////////////////////////////////////////////////////////////////////////////
// Globals
////////////////////////////////////////////////////////////////////////////////
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
size_t totalGlobalMem;
const char *sSDKsample = "matrixMulDrv (Driver API)";
void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("[ %s ]\n", sSDKsample);
    runTest(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
// initialize CUDA
CUfunction matrixMul = NULL;
int block_size = 0;
void runTest(int argc, char **argv)
{
// initialize CUDA
CUfunction matrixMul = NULL;
int block_size = 0;
initCUDA(argc, argv, &matrixMul, &block_size);
initCUDA(argc, argv, &matrixMul, &block_size);
// set seed for rand()
srand(2006);
// set seed for rand()
srand(2006);
// allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = reinterpret_cast<float *>(malloc(mem_size_B));
// allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = reinterpret_cast<float *>(malloc(mem_size_B));
// initialize host memory
const float valB = 0.01f;
constantInit(h_A, size_A, 1.0f);
constantInit(h_B, size_B, valB);
// initialize host memory
const float valB = 0.01f;
constantInit(h_A, size_A, 1.0f);
constantInit(h_B, size_B, valB);
// allocate device memory
CUdeviceptr d_A;
checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
CUdeviceptr d_B;
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
// allocate device memory
CUdeviceptr d_A;
checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
CUdeviceptr d_B;
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
// copy host memory to device
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// copy host memory to device
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// allocate device memory for result
size_t size_C = WC * HC;
size_t mem_size_C = sizeof(float) * size_C;
// allocate device memory for result
size_t size_C = WC * HC;
size_t mem_size_C = sizeof(float) * size_C;
CUdeviceptr d_C;
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
CUdeviceptr d_C;
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
// allocate mem for the result on host side
float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));
// allocate mem for the result on host side
float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));
// create and start timer
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
// create and start timer
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
// start the timer
sdkStartTimer(&timer);
// start the timer
sdkStartTimer(&timer);
// There are two ways to launch CUDA kernels via the Driver API.
// In this CUDA Sample, we illustrate both ways to pass parameters
// and specify parameters. By default we use the simpler method.
dim3 block(block_size, block_size, 1);
dim3 grid(WC / block_size, HC / block_size, 1);
// There are two ways to launch CUDA kernels via the Driver API.
// In this CUDA Sample, we illustrate both ways to pass parameters
// and specify parameters. By default we use the simpler method.
dim3 block(block_size, block_size, 1);
dim3 grid(WC / block_size, HC / block_size, 1);
if (1) {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
// Launching (simpler method)
size_t Matrix_Width_A = (size_t)WA;
size_t Matrix_Width_B = (size_t)WB;
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
// new CUDA 4.0 Driver API Kernel launch call
checkCudaErrors(cuLaunchKernel(
matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
2 * block_size * block_size * sizeof(float), NULL, args, NULL));
} else {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
// Launching (advanced method)
int offset = 0;
char argBuffer[256];
// pass in launch parameters (not actually de-referencing CUdeviceptr).
// CUdeviceptr is storing the value of the parameters
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
offset += sizeof(d_C);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
offset += sizeof(d_A);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
offset += sizeof(d_B);
size_t Matrix_Width_A = (size_t)WA;
size_t Matrix_Width_B = (size_t)WB;
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
offset += sizeof(Matrix_Width_A);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
offset += sizeof(Matrix_Width_B);
void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
CU_LAUNCH_PARAM_END};
// new CUDA 4.0 Driver API Kernel launch call
checkCudaErrors(cuLaunchKernel(
matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
2 * block_size * block_size * sizeof(float), NULL, NULL,
reinterpret_cast<void **>(&kernel_launch_config)));
}
// copy result from device to host
checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));
// stop and destroy timer
sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
sdkDeleteTimer(&timer);
printf("Checking computed result for correctness: ");
bool correct = true;
for (int i = 0; i < static_cast<int>(WC * HC); i++) {
if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i,
h_C[i], WA * valB);
correct = false;
if (1) {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
// Launching (simpler method)
size_t Matrix_Width_A = (size_t)WA;
size_t Matrix_Width_B = (size_t)WB;
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
// new CUDA 4.0 Driver API Kernel launch call
checkCudaErrors(cuLaunchKernel(matrixMul,
grid.x,
grid.y,
grid.z,
block.x,
block.y,
block.z,
2 * block_size * block_size * sizeof(float),
NULL,
args,
NULL));
}
}
else {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
// Launching (advanced method)
int offset = 0;
char argBuffer[256];
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
// pass in launch parameters (not actually de-referencing CUdeviceptr).
// CUdeviceptr is storing the value of the parameters
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
offset += sizeof(d_C);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
offset += sizeof(d_A);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
offset += sizeof(d_B);
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
size_t Matrix_Width_A = (size_t)WA;
size_t Matrix_Width_B = (size_t)WB;
// clean up memory
free(h_A);
free(h_B);
free(h_C);
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
checkCudaErrors(cuCtxDestroy(cuContext));
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
offset += sizeof(Matrix_Width_A);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
offset += sizeof(Matrix_Width_B);
void *kernel_launch_config[5] = {
CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};
// new CUDA 4.0 Driver API Kernel launch call
checkCudaErrors(cuLaunchKernel(matrixMul,
grid.x,
grid.y,
grid.z,
block.x,
block.y,
block.z,
2 * block_size * block_size * sizeof(float),
NULL,
NULL,
reinterpret_cast<void **>(&kernel_launch_config)));
}
// copy result from device to host
checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));
// stop and destroy timer
sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
sdkDeleteTimer(&timer);
printf("Checking computed result for correctness: ");
bool correct = true;
for (int i = 0; i < static_cast<int>(WC * HC); i++) {
if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * valB);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
// clean up memory
free(h_A);
free(h_B);
free(h_C);
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
checkCudaErrors(cuCtxDestroy(cuContext));
}
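A note on the PASS/FAIL check in runTest: A is filled with ones and B with valB, so every output element reduces to the same dot product, which is why the reference value is WA * valB. A minimal host-side sketch of that arithmetic (hypothetical helper, not part of the sample):

// Each C element is a dot product of one row of A (WA ones) with one
// column of B (WA copies of valB), hence the reference value WA * valB.
// With block_size == 32, WA == 128 and the expected element is 1.28f.
static float expectedElement(int widthA, float valB)
{
    float acc = 0.0f;
    for (int k = 0; k < widthA; ++k)
        acc += 1.0f * valB; // A holds 1.0f everywhere, B holds valB
    return acc;             // == widthA * valB
}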
// Allocates a matrix with random float entries.
void randomInit(float *data, int size) {
for (int i = 0; i < size; ++i) {
data[i] = rand() / static_cast<float>(RAND_MAX);
}
}
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
int *blk_size) {
CUfunction cuFunction = 0;
int major = 0, minor = 0;
char deviceName[100];
cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// get compute capabilities and the devicename
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
printf(" Total amount of global memory: %llu bytes\n",
(long long unsigned int)totalGlobalMem);
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
// first search for the module path before we load the results
std::string module_path;
std::ostringstream fatbin;
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
exit(EXIT_FAILURE);
} else {
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
}
if (!fatbin.str().size()) {
printf("fatbin file empty. exiting..\n");
exit(EXIT_FAILURE);
}
// Create module from binary file (FATBIN)
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
// select the suitable kernel function
const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit",
"matrixMul_bs8_64bit"};
int idx = 0;
int block_size = 32;
while (idx < 3) {
int threadsPerBlock = 0;
int blocksPerGrid = 0;
checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
&blocksPerGrid, &threadsPerBlock, cuFunction, 0,
2 * block_size * block_size * sizeof(float), 0));
if (block_size * block_size <= threadsPerBlock) {
printf("> %d block size selected\n", block_size);
break;
} else {
block_size /= 2;
void randomInit(float *data, int size)
{
for (int i = 0; i < size; ++i) {
data[i] = rand() / static_cast<float>(RAND_MAX);
}
idx++;
}
*pMatrixMul = cuFunction;
*blk_size = block_size;
return 0;
}
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size)
{
CUfunction cuFunction = 0;
int major = 0, minor = 0;
char deviceName[100];
cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// get compute capabilities and the devicename
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
printf(" Total amount of global memory: %llu bytes\n", (long long unsigned int)totalGlobalMem);
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
// first search for the module path before we load the results
std::string module_path;
std::ostringstream fatbin;
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
exit(EXIT_FAILURE);
}
else {
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
}
if (!fatbin.str().size()) {
printf("fatbin file empty. exiting..\n");
exit(EXIT_FAILURE);
}
// Create module from binary file (FATBIN)
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
// select the suitable kernel function
const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit", "matrixMul_bs8_64bit"};
int idx = 0;
int block_size = 32;
while (idx < 3) {
int threadsPerBlock = 0;
int blocksPerGrid = 0;
checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
&blocksPerGrid, &threadsPerBlock, cuFunction, 0, 2 * block_size * block_size * sizeof(float), 0));
if (block_size * block_size <= threadsPerBlock) {
printf("> %d block size selected\n", block_size);
break;
}
else {
block_size /= 2;
}
idx++;
}
*pMatrixMul = cuFunction;
*blk_size = block_size;
return 0;
}
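For reference, the two cuLaunchKernel parameter-passing styles exercised in runTest above reduce to a few lines. This is a hedged sketch against a hypothetical kernel taking (CUdeviceptr, int), not the sample's kernel: the CU_LAUNCH_PARAM_* buffer must pack each argument at its natural alignment, the size is passed through a size_t, and kernelParams must be NULL whenever extra is used.

#include <cstring>
#include <cuda.h>

// Hypothetical kernel signature: __global__ void k(float *out, int n)
static CUresult launchBothWays(CUfunction k, CUdeviceptr out, int n)
{
    // Style 1: kernelParams, an array of pointers to each argument.
    void *args[] = {&out, &n};
    CUresult r = cuLaunchKernel(k, 1, 1, 1, 32, 1, 1, 0, NULL, args, NULL);
    if (r != CUDA_SUCCESS)
        return r;

    // Style 2: extra, packing the arguments into one raw buffer that is
    // described by CU_LAUNCH_PARAM_* tags.
    char buf[sizeof(CUdeviceptr) + sizeof(int)];
    size_t size = 0;
    memcpy(buf + size, &out, sizeof(out));
    size += sizeof(out);
    memcpy(buf + size, &n, sizeof(n));
    size += sizeof(n);
    void *extra[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, buf,
                     CU_LAUNCH_PARAM_BUFFER_SIZE, &size,
                     CU_LAUNCH_PARAM_END};
    return cuLaunchKernel(k, 1, 1, 1, 32, 1, 1, 0, NULL, NULL, extra);
}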

View File

@@ -42,86 +42,87 @@
//! wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
template <int block_size, typename size_type>
__device__ void matrixMul(float *C, float *A, float *B, size_type wA,
size_type wB) {
// Block index
size_type bx = blockIdx.x;
size_type by = blockIdx.y;
__device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB)
{
// Block index
size_type bx = blockIdx.x;
size_type by = blockIdx.y;
// Thread index
size_type tx = threadIdx.x;
size_type ty = threadIdx.y;
// Thread index
size_type tx = threadIdx.x;
size_type ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block
size_type aBegin = wA * block_size * by;
// Index of the first sub-matrix of A processed by the block
size_type aBegin = wA * block_size * by;
// Index of the last sub-matrix of A processed by the block
size_type aEnd = aBegin + wA - 1;
// Index of the last sub-matrix of A processed by the block
size_type aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A
size_type aStep = block_size;
// Step size used to iterate through the sub-matrices of A
size_type aStep = block_size;
// Index of the first sub-matrix of B processed by the block
size_type bBegin = block_size * bx;
// Index of the first sub-matrix of B processed by the block
size_type bBegin = block_size * bx;
// Step size used to iterate through the sub-matrices of B
size_type bStep = block_size * wB;
// Step size used to iterate through the sub-matrices of B
size_type bStep = block_size * wB;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[block_size][block_size];
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[block_size][block_size];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[block_size][block_size];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[block_size][block_size];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
AS(ty, tx) = A[a + wA * ty + tx];
BS(ty, tx) = B[b + wB * ty + tx];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
AS(ty, tx) = A[a + wA * ty + tx];
BS(ty, tx) = B[b + wB * ty + tx];
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (size_type k = 0; k < block_size; ++k) Csub += AS(ty, k) * BS(k, tx);
for (size_type k = 0; k < block_size; ++k)
Csub += AS(ty, k) * BS(k, tx);
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
size_type c = wB * block_size * by + block_size * bx;
C[c + wB * ty + tx] = Csub;
// Write the block sub-matrix to device memory;
// each thread writes one element
size_type c = wB * block_size * by + block_size * bx;
C[c + wB * ty + tx] = Csub;
}
// C wrappers around our template kernel
extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B,
size_t wA, size_t wB) {
matrixMul<8, size_t>(C, A, B, wA, wB);
extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
matrixMul<8, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B,
size_t wA, size_t wB) {
matrixMul<16, size_t>(C, A, B, wA, wB);
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
matrixMul<16, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B,
size_t wA, size_t wB) {
matrixMul<32, size_t>(C, A, B, wA, wB);
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
matrixMul<32, size_t>(C, A, B, wA, wB);
}
#endif // #ifndef _MATRIXMUL_KERNEL_H_
#endif // #ifndef _MATRIXMUL_KERNEL_H_
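The AS and BS accessors used by the kernel are not part of this hunk; in the sample they are presumably plain 2-D macros over the shared tiles, along the lines of the sketch below. Note that the 2 * block_size * block_size * sizeof(float) the host passes at launch matches the combined size of the two statically declared tiles.

// Assumed definitions (declared elsewhere in the sample's sources):
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]

// Combined shared-memory footprint of the two tiles each block declares;
// this is the same quantity runTest passes to cuLaunchKernel.
constexpr size_t tileBytes(int block_size)
{
    return 2u * block_size * block_size * sizeof(float);
}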

View File

@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@@ -27,6 +27,6 @@ cuMemcpyDtoH, cuDeviceGetName, cuParamSeti, cuModuleLoadDataEx, cuModuleGetFunct
## Prerequisites
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details)

View File

@@ -15,210 +15,211 @@
// With these flags defined, this source file will dynamically
// load the corresponding functions. Disabled by default.
//#define CUDA_INIT_D3D9
//#define CUDA_INIT_D3D10
//#define CUDA_INIT_D3D11
//#define CUDA_INIT_OPENGL
// #define CUDA_INIT_D3D9
// #define CUDA_INIT_D3D10
// #define CUDA_INIT_D3D11
// #define CUDA_INIT_OPENGL
#include <stdio.h>
#include "cuda_drvapi_dynlink.h"
tcuInit *_cuInit;
tcuDriverGetVersion *cuDriverGetVersion;
tcuDeviceGet *cuDeviceGet;
tcuDeviceGetCount *cuDeviceGetCount;
tcuDeviceGetName *cuDeviceGetName;
tcuDeviceComputeCapability *cuDeviceComputeCapability;
tcuDeviceTotalMem *cuDeviceTotalMem;
tcuDeviceGetProperties *cuDeviceGetProperties;
tcuDeviceGetAttribute *cuDeviceGetAttribute;
tcuGetErrorString *cuGetErrorString;
tcuCtxCreate *cuCtxCreate;
tcuCtxDestroy *cuCtxDestroy;
tcuCtxAttach *cuCtxAttach;
tcuCtxDetach *cuCtxDetach;
tcuCtxPushCurrent *cuCtxPushCurrent;
tcuCtxPopCurrent *cuCtxPopCurrent;
tcuCtxGetCurrent *cuCtxGetCurrent;
tcuCtxSetCurrent *cuCtxSetCurrent;
tcuCtxGetDevice *cuCtxGetDevice;
tcuCtxSynchronize *cuCtxSynchronize;
tcuModuleLoad *cuModuleLoad;
tcuModuleLoadData *cuModuleLoadData;
tcuModuleLoadDataEx *cuModuleLoadDataEx;
tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
tcuModuleUnload *cuModuleUnload;
tcuModuleGetFunction *cuModuleGetFunction;
tcuModuleGetGlobal *cuModuleGetGlobal;
tcuModuleGetTexRef *cuModuleGetTexRef;
tcuModuleGetSurfRef *cuModuleGetSurfRef;
tcuMemGetInfo *cuMemGetInfo;
tcuMemAlloc *cuMemAlloc;
tcuMemAllocPitch *cuMemAllocPitch;
tcuMemFree *cuMemFree;
tcuMemGetAddressRange *cuMemGetAddressRange;
tcuMemAllocHost *cuMemAllocHost;
tcuMemFreeHost *cuMemFreeHost;
tcuMemHostAlloc *cuMemHostAlloc;
tcuMemHostGetFlags *cuMemHostGetFlags;
#include <stdio.h>
tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
tcuDeviceGetByPCIBusId *cuDeviceGetByPCIBusId;
tcuDeviceGetPCIBusId *cuDeviceGetPCIBusId;
tcuIpcGetEventHandle *cuIpcGetEventHandle;
tcuIpcOpenEventHandle *cuIpcOpenEventHandle;
tcuIpcGetMemHandle *cuIpcGetMemHandle;
tcuIpcOpenMemHandle *cuIpcOpenMemHandle;
tcuIpcCloseMemHandle *cuIpcCloseMemHandle;
tcuInit *_cuInit;
tcuDriverGetVersion *cuDriverGetVersion;
tcuDeviceGet *cuDeviceGet;
tcuDeviceGetCount *cuDeviceGetCount;
tcuDeviceGetName *cuDeviceGetName;
tcuDeviceComputeCapability *cuDeviceComputeCapability;
tcuDeviceTotalMem *cuDeviceTotalMem;
tcuDeviceGetProperties *cuDeviceGetProperties;
tcuDeviceGetAttribute *cuDeviceGetAttribute;
tcuGetErrorString *cuGetErrorString;
tcuCtxCreate *cuCtxCreate;
tcuCtxDestroy *cuCtxDestroy;
tcuCtxAttach *cuCtxAttach;
tcuCtxDetach *cuCtxDetach;
tcuCtxPushCurrent *cuCtxPushCurrent;
tcuCtxPopCurrent *cuCtxPopCurrent;
tcuCtxGetCurrent *cuCtxGetCurrent;
tcuCtxSetCurrent *cuCtxSetCurrent;
tcuCtxGetDevice *cuCtxGetDevice;
tcuCtxSynchronize *cuCtxSynchronize;
tcuModuleLoad *cuModuleLoad;
tcuModuleLoadData *cuModuleLoadData;
tcuModuleLoadDataEx *cuModuleLoadDataEx;
tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
tcuModuleUnload *cuModuleUnload;
tcuModuleGetFunction *cuModuleGetFunction;
tcuModuleGetGlobal *cuModuleGetGlobal;
tcuModuleGetTexRef *cuModuleGetTexRef;
tcuModuleGetSurfRef *cuModuleGetSurfRef;
tcuMemGetInfo *cuMemGetInfo;
tcuMemAlloc *cuMemAlloc;
tcuMemAllocPitch *cuMemAllocPitch;
tcuMemFree *cuMemFree;
tcuMemGetAddressRange *cuMemGetAddressRange;
tcuMemAllocHost *cuMemAllocHost;
tcuMemFreeHost *cuMemFreeHost;
tcuMemHostAlloc *cuMemHostAlloc;
tcuMemHostGetFlags *cuMemHostGetFlags;
tcuMemHostRegister *cuMemHostRegister;
tcuMemHostUnregister *cuMemHostUnregister;
tcuMemcpyHtoD *cuMemcpyHtoD;
tcuMemcpyDtoH *cuMemcpyDtoH;
tcuMemcpyDtoD *cuMemcpyDtoD;
tcuMemcpyDtoA *cuMemcpyDtoA;
tcuMemcpyAtoD *cuMemcpyAtoD;
tcuMemcpyHtoA *cuMemcpyHtoA;
tcuMemcpyAtoH *cuMemcpyAtoH;
tcuMemcpyAtoA *cuMemcpyAtoA;
tcuMemcpy2D *cuMemcpy2D;
tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
tcuMemcpy3D *cuMemcpy3D;
tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
tcuMemcpy2DAsync *cuMemcpy2DAsync;
tcuMemcpy3DAsync *cuMemcpy3DAsync;
tcuMemcpy *cuMemcpy;
tcuMemcpyPeer *cuMemcpyPeer;
tcuMemsetD8 *cuMemsetD8;
tcuMemsetD16 *cuMemsetD16;
tcuMemsetD32 *cuMemsetD32;
tcuMemsetD2D8 *cuMemsetD2D8;
tcuMemsetD2D16 *cuMemsetD2D16;
tcuMemsetD2D32 *cuMemsetD2D32;
tcuFuncSetBlockShape *cuFuncSetBlockShape;
tcuFuncSetSharedSize *cuFuncSetSharedSize;
tcuFuncGetAttribute *cuFuncGetAttribute;
tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
tcuLaunchKernel *cuLaunchKernel;
tcuArrayCreate *cuArrayCreate;
tcuArrayGetDescriptor *cuArrayGetDescriptor;
tcuArrayDestroy *cuArrayDestroy;
tcuArray3DCreate *cuArray3DCreate;
tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
tcuTexRefCreate *cuTexRefCreate;
tcuTexRefDestroy *cuTexRefDestroy;
tcuTexRefSetArray *cuTexRefSetArray;
tcuTexRefSetAddress *cuTexRefSetAddress;
tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
tcuTexRefSetFormat *cuTexRefSetFormat;
tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
tcuTexRefSetFlags *cuTexRefSetFlags;
tcuTexRefGetAddress *cuTexRefGetAddress;
tcuTexRefGetArray *cuTexRefGetArray;
tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
tcuTexRefGetFormat *cuTexRefGetFormat;
tcuTexRefGetFlags *cuTexRefGetFlags;
tcuSurfRefSetArray *cuSurfRefSetArray;
tcuSurfRefGetArray *cuSurfRefGetArray;
tcuParamSetSize *cuParamSetSize;
tcuParamSeti *cuParamSeti;
tcuParamSetf *cuParamSetf;
tcuParamSetv *cuParamSetv;
tcuParamSetTexRef *cuParamSetTexRef;
tcuLaunch *cuLaunch;
tcuLaunchGrid *cuLaunchGrid;
tcuLaunchGridAsync *cuLaunchGridAsync;
tcuEventCreate *cuEventCreate;
tcuEventRecord *cuEventRecord;
tcuEventQuery *cuEventQuery;
tcuEventSynchronize *cuEventSynchronize;
tcuEventDestroy *cuEventDestroy;
tcuEventElapsedTime *cuEventElapsedTime;
tcuStreamCreate *cuStreamCreate;
tcuStreamWaitEvent *cuStreamWaitEvent;
tcuStreamAddCallback *cuStreamAddCallback;
tcuStreamQuery *cuStreamQuery;
tcuStreamSynchronize *cuStreamSynchronize;
tcuStreamDestroy *cuStreamDestroy;
tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
tcuGraphicsMapResources *cuGraphicsMapResources;
tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
tcuGetExportTable *cuGetExportTable;
tcuCtxSetLimit *cuCtxSetLimit;
tcuCtxGetLimit *cuCtxGetLimit;
tcuCtxGetCacheConfig *cuCtxGetCacheConfig;
tcuCtxSetCacheConfig *cuCtxSetCacheConfig;
tcuCtxGetSharedMemConfig *cuCtxGetSharedMemConfig;
tcuCtxSetSharedMemConfig *cuCtxSetSharedMemConfig;
tcuCtxGetApiVersion *cuCtxGetApiVersion;
tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
tcuDeviceGetByPCIBusId *cuDeviceGetByPCIBusId;
tcuDeviceGetPCIBusId *cuDeviceGetPCIBusId;
tcuIpcGetEventHandle *cuIpcGetEventHandle;
tcuIpcOpenEventHandle *cuIpcOpenEventHandle;
tcuIpcGetMemHandle *cuIpcGetMemHandle;
tcuIpcOpenMemHandle *cuIpcOpenMemHandle;
tcuIpcCloseMemHandle *cuIpcCloseMemHandle;
tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;
tcuMemHostRegister *cuMemHostRegister;
tcuMemHostUnregister *cuMemHostUnregister;
tcuMemcpyHtoD *cuMemcpyHtoD;
tcuMemcpyDtoH *cuMemcpyDtoH;
tcuMemcpyDtoD *cuMemcpyDtoD;
tcuMemcpyDtoA *cuMemcpyDtoA;
tcuMemcpyAtoD *cuMemcpyAtoD;
tcuMemcpyHtoA *cuMemcpyHtoA;
tcuMemcpyAtoH *cuMemcpyAtoH;
tcuMemcpyAtoA *cuMemcpyAtoA;
tcuMemcpy2D *cuMemcpy2D;
tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
tcuMemcpy3D *cuMemcpy3D;
tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
tcuMemcpy2DAsync *cuMemcpy2DAsync;
tcuMemcpy3DAsync *cuMemcpy3DAsync;
tcuMemcpy *cuMemcpy;
tcuMemcpyPeer *cuMemcpyPeer;
tcuMemsetD8 *cuMemsetD8;
tcuMemsetD16 *cuMemsetD16;
tcuMemsetD32 *cuMemsetD32;
tcuMemsetD2D8 *cuMemsetD2D8;
tcuMemsetD2D16 *cuMemsetD2D16;
tcuMemsetD2D32 *cuMemsetD2D32;
tcuFuncSetBlockShape *cuFuncSetBlockShape;
tcuFuncSetSharedSize *cuFuncSetSharedSize;
tcuFuncGetAttribute *cuFuncGetAttribute;
tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
tcuLaunchKernel *cuLaunchKernel;
tcuArrayCreate *cuArrayCreate;
tcuArrayGetDescriptor *cuArrayGetDescriptor;
tcuArrayDestroy *cuArrayDestroy;
tcuArray3DCreate *cuArray3DCreate;
tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
tcuTexRefCreate *cuTexRefCreate;
tcuTexRefDestroy *cuTexRefDestroy;
tcuTexRefSetArray *cuTexRefSetArray;
tcuTexRefSetAddress *cuTexRefSetAddress;
tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
tcuTexRefSetFormat *cuTexRefSetFormat;
tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
tcuTexRefSetFlags *cuTexRefSetFlags;
tcuTexRefGetAddress *cuTexRefGetAddress;
tcuTexRefGetArray *cuTexRefGetArray;
tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
tcuTexRefGetFormat *cuTexRefGetFormat;
tcuTexRefGetFlags *cuTexRefGetFlags;
tcuSurfRefSetArray *cuSurfRefSetArray;
tcuSurfRefGetArray *cuSurfRefGetArray;
tcuParamSetSize *cuParamSetSize;
tcuParamSeti *cuParamSeti;
tcuParamSetf *cuParamSetf;
tcuParamSetv *cuParamSetv;
tcuParamSetTexRef *cuParamSetTexRef;
tcuLaunch *cuLaunch;
tcuLaunchGrid *cuLaunchGrid;
tcuLaunchGridAsync *cuLaunchGridAsync;
tcuEventCreate *cuEventCreate;
tcuEventRecord *cuEventRecord;
tcuEventQuery *cuEventQuery;
tcuEventSynchronize *cuEventSynchronize;
tcuEventDestroy *cuEventDestroy;
tcuEventElapsedTime *cuEventElapsedTime;
tcuStreamCreate *cuStreamCreate;
tcuStreamWaitEvent *cuStreamWaitEvent;
tcuStreamAddCallback *cuStreamAddCallback;
tcuStreamQuery *cuStreamQuery;
tcuStreamSynchronize *cuStreamSynchronize;
tcuStreamDestroy *cuStreamDestroy;
tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
tcuGraphicsMapResources *cuGraphicsMapResources;
tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
tcuGetExportTable *cuGetExportTable;
tcuCtxSetLimit *cuCtxSetLimit;
tcuCtxGetLimit *cuCtxGetLimit;
tcuCtxGetCacheConfig *cuCtxGetCacheConfig;
tcuCtxSetCacheConfig *cuCtxSetCacheConfig;
tcuCtxGetSharedMemConfig *cuCtxGetSharedMemConfig;
tcuCtxSetSharedMemConfig *cuCtxSetSharedMemConfig;
tcuCtxGetApiVersion *cuCtxGetApiVersion;
tcuProfilerStop *cuProfilerStop;
tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;
tcuProfilerStop *cuProfilerStop;
#ifdef CUDA_INIT_D3D9
// D3D9/CUDA interop (CUDA 1.x compatible API). These functions
// are deprecated; please use the ones below
tcuD3D9Begin *cuD3D9Begin;
tcuD3D9End *cuD3DEnd;
tcuD3D9RegisterVertexBuffer *cuD3D9RegisterVertexBuffer;
tcuD3D9MapVertexBuffer *cuD3D9MapVertexBuffer;
tcuD3D9UnmapVertexBuffer *cuD3D9UnmapVertexBuffer;
tcuD3D9UnregisterVertexBuffer *cuD3D9UnregisterVertexBuffer;
tcuD3D9Begin *cuD3D9Begin;
tcuD3D9End *cuD3DEnd;
tcuD3D9RegisterVertexBuffer *cuD3D9RegisterVertexBuffer;
tcuD3D9MapVertexBuffer *cuD3D9MapVertexBuffer;
tcuD3D9UnmapVertexBuffer *cuD3D9UnmapVertexBuffer;
tcuD3D9UnregisterVertexBuffer *cuD3D9UnregisterVertexBuffer;
// D3D9/CUDA interop (CUDA 2.x compatible)
tcuD3D9GetDirect3DDevice *cuD3D9GetDirect3DDevice;
tcuD3D9RegisterResource *cuD3D9RegisterResource;
tcuD3D9UnregisterResource *cuD3D9UnregisterResource;
tcuD3D9MapResources *cuD3D9MapResources;
tcuD3D9UnmapResources *cuD3D9UnmapResources;
tcuD3D9ResourceSetMapFlags *cuD3D9ResourceSetMapFlags;
tcuD3D9ResourceGetSurfaceDimensions *cuD3D9ResourceGetSurfaceDimensions;
tcuD3D9ResourceGetMappedArray *cuD3D9ResourceGetMappedArray;
tcuD3D9ResourceGetMappedPointer *cuD3D9ResourceGetMappedPointer;
tcuD3D9ResourceGetMappedSize *cuD3D9ResourceGetMappedSize;
tcuD3D9ResourceGetMappedPitch *cuD3D9ResourceGetMappedPitch;
tcuD3D9GetDirect3DDevice *cuD3D9GetDirect3DDevice;
tcuD3D9RegisterResource *cuD3D9RegisterResource;
tcuD3D9UnregisterResource *cuD3D9UnregisterResource;
tcuD3D9MapResources *cuD3D9MapResources;
tcuD3D9UnmapResources *cuD3D9UnmapResources;
tcuD3D9ResourceSetMapFlags *cuD3D9ResourceSetMapFlags;
tcuD3D9ResourceGetSurfaceDimensions *cuD3D9ResourceGetSurfaceDimensions;
tcuD3D9ResourceGetMappedArray *cuD3D9ResourceGetMappedArray;
tcuD3D9ResourceGetMappedPointer *cuD3D9ResourceGetMappedPointer;
tcuD3D9ResourceGetMappedSize *cuD3D9ResourceGetMappedSize;
tcuD3D9ResourceGetMappedPitch *cuD3D9ResourceGetMappedPitch;
// D3D9/CUDA interop (CUDA 2.0+)
tcuD3D9GetDevice *cuD3D9GetDevice;
tcuD3D9CtxCreate *cuD3D9CtxCreate;
tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource;
tcuD3D9GetDevice *cuD3D9GetDevice;
tcuD3D9CtxCreate *cuD3D9CtxCreate;
tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource;
#endif
#ifdef CUDA_INIT_D3D10
// D3D10/CUDA interop (CUDA 3.0+)
tcuD3D10GetDevice *cuD3D10GetDevice;
tcuD3D10CtxCreate *cuD3D10CtxCreate;
tcuGraphicsD3D10RegisterResource *cuGraphicsD3D10RegisterResource;
tcuD3D10GetDevice *cuD3D10GetDevice;
tcuD3D10CtxCreate *cuD3D10CtxCreate;
tcuGraphicsD3D10RegisterResource *cuGraphicsD3D10RegisterResource;
#endif
#ifdef CUDA_INIT_D3D11
// D3D11/CUDA interop (CUDA 3.0+)
tcuD3D11GetDevice *cuD3D11GetDevice;
tcuD3D11CtxCreate *cuD3D11CtxCreate;
tcuGraphicsD3D11RegisterResource *cuGraphicsD3D11RegisterResource;
tcuD3D11GetDevice *cuD3D11GetDevice;
tcuD3D11CtxCreate *cuD3D11CtxCreate;
tcuGraphicsD3D11RegisterResource *cuGraphicsD3D11RegisterResource;
#endif
// GL/CUDA interop
#ifdef CUDA_INIT_OPENGL
tcuGLCtxCreate *cuGLCtxCreate;
tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
tcuGLCtxCreate *cuGLCtxCreate;
tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
tcuWGLGetDevice *cuWGLGetDevice;
tcuWGLGetDevice *cuWGLGetDevice;
#endif
#endif
@@ -239,8 +240,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
{
*pInstance = LoadLibrary(__CudaLibName);
if (*pInstance == NULL)
{
if (*pInstance == NULL) {
printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
return CUDA_ERROR_UNKNOWN;
}
@@ -248,38 +248,35 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
return CUDA_SUCCESS;
}
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
#name, __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));\
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v2), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V3(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));\
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v3), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
#define GET_PROC_EX_V3(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
}
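STRINGIFY itself is not in this hunk; it is presumably the usual two-level stringizing pair defined elsewhere in the sources. Under that assumption, GET_PROC_EX_V2(cuCtxCreate, cuCtxCreate, 1) first token-pastes name##_v2 and then stringizes it, so the lookup is performed against the versioned symbol name:

// Assumed two-level stringize (a single-level #x would produce the
// literal "name##_v2" instead of the pasted token):
#define STRINGIFY_HELPER(x) #x
#define STRINGIFY(x) STRINGIFY_HELPER(x)

// Inside GET_PROC_EX_V2, STRINGIFY(name##_v2) with name == cuCtxCreate
// yields "cuCtxCreate_v2", which GetProcAddress/dlsym is asked to find.
static_assert(sizeof(STRINGIFY(cuCtxCreate_v2)) == sizeof("cuCtxCreate_v2"),
              "expands to the _v2 symbol name");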
#elif defined(__unix__) || defined (__QNX__) || defined(__APPLE__) || defined(__MACOSX)
#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || defined(__MACOSX)
#include <dlfcn.h>
#if defined(__APPLE__) || defined(__MACOSX)
static char __CudaLibName[] = "/usr/local/cuda/lib/libcuda.dylib";
#elif defined(__ANDROID__)
#if defined (__aarch64__)
#if defined(__aarch64__)
static char __CudaLibName[] = "/system/vendor/lib64/libcuda.so";
#elif defined(__arm__)
static char __CudaLibName[] = "/system/vendor/lib/libcuda.so";
@@ -294,8 +291,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
{
*pInstance = dlopen(__CudaLibName, RTLD_NOW);
if (*pInstance == NULL)
{
if (*pInstance == NULL) {
printf("dlopen \"%s\" failed!\n", __CudaLibName);
return CUDA_ERROR_UNKNOWN;
}
@@ -303,52 +299,49 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
return CUDA_SUCCESS;
}
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
#name, __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v2), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V3(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v3), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
#define GET_PROC_EX_V3(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#else
#error unsupported platform
#endif
#define CHECKED_CALL(call) \
do { \
CUresult result = (call); \
if (CUDA_SUCCESS != result) { \
return result; \
} \
} while(0)
#define CHECKED_CALL(call) \
do { \
CUresult result = (call); \
if (CUDA_SUCCESS != result) { \
return result; \
} \
} while (0)
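A brief note on the do { ... } while (0) shape of CHECKED_CALL: it turns the multi-statement macro body into a single statement, so the macro composes safely with an unbraced if/else. A hypothetical caller, assuming the dynlink declarations above:

static CUresult syncIfRequested(int doSync)
{
    if (doSync)
        CHECKED_CALL(cuCtxSynchronize()); // expands to one statement...
    else
        printf("skipping sync\n");        // ...so this else still pairs with the if
    return CUDA_SUCCESS;
}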
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0)
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name, name, 1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name, name, 0)
#define GET_PROC(name) GET_PROC_REQUIRED(name)
#define GET_PROC_V2(name) GET_PROC_EX_V2(name,name,1)
#define GET_PROC_V3(name) GET_PROC_EX_V3(name,name,1)
#define GET_PROC_V2(name) GET_PROC_EX_V2(name, name, 1)
#define GET_PROC_V3(name) GET_PROC_EX_V3(name, name, 1)
CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
{
CUDADRIVER CudaDrvLib;
int driverVer = 1000;
int driverVer = 1000;
CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));
@@ -359,8 +352,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
// available since 2.2. if not present, version 1.0 is assumed
GET_PROC_OPTIONAL(cuDriverGetVersion);
if (cuDriverGetVersion)
{
if (cuDriverGetVersion) {
CHECKED_CALL(cuDriverGetVersion(&driverVer));
}
@@ -428,24 +420,21 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuStreamDestroy);
// These are CUDA 5.0 new functions
if (driverVer >= 5000)
{
if (driverVer >= 5000) {
GET_PROC(cuMipmappedArrayCreate);
GET_PROC(cuMipmappedArrayDestroy);
GET_PROC(cuMipmappedArrayGetLevel);
}
// These are CUDA 4.2 new functions
if (driverVer >= 4020)
{
if (driverVer >= 4020) {
GET_PROC(cuFuncSetSharedMemConfig);
GET_PROC(cuCtxGetSharedMemConfig);
GET_PROC(cuCtxSetSharedMemConfig);
}
// These are CUDA 4.1 new functions
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
{
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
GET_PROC(cuDeviceGetByPCIBusId);
GET_PROC(cuDeviceGetPCIBusId);
GET_PROC(cuIpcGetEventHandle);
@@ -456,8 +445,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
}
// These could be _v2 interfaces
if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000)
{
if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) {
GET_PROC_V2(cuCtxDestroy);
GET_PROC_V2(cuCtxPopCurrent);
GET_PROC_V2(cuCtxPushCurrent);
@@ -465,8 +453,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC_V2(cuEventDestroy);
}
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
{
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
GET_PROC_V2(cuDeviceTotalMem);
GET_PROC_V2(cuCtxCreate);
GET_PROC_V2(cuModuleGetGlobal);
@@ -507,17 +494,14 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC_V2(cuTexRefSetAddress);
GET_PROC_V2(cuTexRefGetAddress);
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
{
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
GET_PROC_V3(cuTexRefSetAddress2D);
}
else
{
else {
GET_PROC_V2(cuTexRefSetAddress2D);
}
}
else
{
else {
// versions earlier than 3020
GET_PROC(cuDeviceTotalMem);
GET_PROC(cuCtxCreate);
@@ -562,8 +546,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
}
// The following functions are specific to CUDA versions
if (driverVer >= 4000)
{
if (driverVer >= 4000) {
GET_PROC(cuCtxSetCurrent);
GET_PROC(cuCtxGetCurrent);
GET_PROC(cuMemHostRegister);
@@ -574,8 +557,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuProfilerStop);
}
if (driverVer >= 3010)
{
if (driverVer >= 3010) {
GET_PROC(cuModuleGetSurfRef);
GET_PROC(cuSurfRefSetArray);
GET_PROC(cuSurfRefGetArray);
@@ -583,8 +565,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuCtxGetLimit);
}
if (driverVer >= 3000)
{
if (driverVer >= 3000) {
GET_PROC(cuMemcpyDtoDAsync);
GET_PROC(cuFuncSetCacheConfig);
#ifdef CUDA_INIT_D3D11
@@ -595,12 +576,10 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuGraphicsUnregisterResource);
GET_PROC(cuGraphicsSubResourceGetMappedArray);
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
{
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
GET_PROC_V2(cuGraphicsResourceGetMappedPointer);
}
else
{
else {
GET_PROC(cuGraphicsResourceGetMappedPointer);
}
@@ -610,8 +589,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuGetExportTable);
}
if (driverVer >= 2030)
{
if (driverVer >= 2030) {
GET_PROC(cuMemHostGetFlags);
#ifdef CUDA_INIT_D3D10
GET_PROC(cuD3D10GetDevice);
@@ -624,17 +602,16 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
#endif
}
if (driverVer >= 2010)
{
if (driverVer >= 2010) {
GET_PROC(cuModuleLoadDataEx);
GET_PROC(cuModuleLoadFatBinary);
#ifdef CUDA_INIT_OPENGL
GET_PROC(cuGLCtxCreate);
GET_PROC(cuGraphicsGLRegisterBuffer);
GET_PROC(cuGraphicsGLRegisterImage);
# ifdef WIN32
#ifdef WIN32
GET_PROC(cuWGLGetDevice);
# endif
#endif
#endif
#ifdef CUDA_INIT_D3D9
GET_PROC(cuD3D9GetDevice);

View File

@@ -14,21 +14,17 @@
#ifndef HELPER_CUDA_DRVAPI_H
#define HELPER_CUDA_DRVAPI_H
#include <helper_string.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <helper_string.h>
#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
#endif
#ifndef HELPER_CUDA_DRVAPI_H
inline int ftoi(float value) {
return (value >= 0 ? static_cast<int>(value + 0.5)
: static_cast<int>(value - 0.5));
}
inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5) : static_cast<int>(value - 0.5)); }
#endif
#ifndef EXIT_WAIVED
@@ -47,311 +43,302 @@ inline int ftoi(float value) {
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
// These are the inline versions for all of the SDK helper functions
inline void __checkCudaErrors(CUresult err, const char *file, const int line) {
if (CUDA_SUCCESS != err) {
const char *errorStr = NULL;
cuGetErrorString(err, &errorStr);
fprintf(stderr,
"checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
"line %i.\n",
err, errorStr, file, line);
exit(EXIT_FAILURE);
}
inline void __checkCudaErrors(CUresult err, const char *file, const int line)
{
if (CUDA_SUCCESS != err) {
const char *errorStr = NULL;
cuGetErrorString(err, &errorStr);
fprintf(stderr,
"checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
"line %i.\n",
err,
errorStr,
file,
line);
exit(EXIT_FAILURE);
}
}
#endif
// This function wraps the CUDA Driver API into a template function
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
int device) {
checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
}
#endif
// Beginning of GPU Architecture definitions
inline int _ConvertSMVer2CoresDRV(int major, int minor) {
// Defines for GPU Architecture types (using the SM version to determine the #
// of cores per SM
typedef struct {
int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM
// minor version
int Cores;
} sSMtoCores;
inline int _ConvertSMVer2CoresDRV(int major, int minor)
{
// Defines for GPU Architecture types (using the SM version to determine the #
// of cores per SM
typedef struct
{
int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM
// minor version
int Cores;
} sSMtoCores;
sSMtoCores nGpuArchCoresPerSM[] = {
{0x30, 192},
{0x32, 192},
{0x35, 192},
{0x37, 192},
{0x50, 128},
{0x52, 128},
{0x53, 128},
{0x60, 64},
{0x61, 128},
{0x62, 128},
{0x70, 64},
{0x72, 64},
{0x75, 64},
{0x80, 64},
{0x86, 128},
{0x87, 128},
{0x90, 128},
{-1, -1}};
sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192},
{0x32, 192},
{0x35, 192},
{0x37, 192},
{0x50, 128},
{0x52, 128},
{0x53, 128},
{0x60, 64},
{0x61, 128},
{0x62, 128},
{0x70, 64},
{0x72, 64},
{0x75, 64},
{0x80, 64},
{0x86, 128},
{0x87, 128},
{0x90, 128},
{-1, -1}};
int index = 0;
int index = 0;
while (nGpuArchCoresPerSM[index].SM != -1) {
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
return nGpuArchCoresPerSM[index].Cores;
while (nGpuArchCoresPerSM[index].SM != -1) {
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
return nGpuArchCoresPerSM[index].Cores;
}
index++;
}
index++;
}
// If we don't find the values, we default to the previous one to run
// properly
printf(
"MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
major, minor, nGpuArchCoresPerSM[index - 1].Cores);
return nGpuArchCoresPerSM[index - 1].Cores;
// If we don't find the values, we default to the previous one to run
// properly
printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
major,
minor,
nGpuArchCoresPerSM[index - 1].Cores);
return nGpuArchCoresPerSM[index - 1].Cores;
}
// end of GPU Architecture definitions
// end of GPU Architecture definitions
#ifdef __cuda_cuda_h__
// General GPU Device CUDA Initialization
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
int cuDevice = 0;
int deviceCount = 0;
checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
{
int cuDevice = 0;
int deviceCount = 0;
checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
checkCudaErrors(cuDeviceGetCount(&deviceCount));
checkCudaErrors(cuDeviceGetCount(&deviceCount));
if (deviceCount == 0) {
fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
exit(EXIT_FAILURE);
}
if (deviceCount == 0) {
fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
exit(EXIT_FAILURE);
}
int dev = 0;
dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device=");
int dev = 0;
dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device=");
if (dev < 0) {
dev = 0;
}
if (dev < 0) {
dev = 0;
}
if (dev > deviceCount - 1) {
fprintf(stderr, "\n");
fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
deviceCount);
fprintf(stderr,
">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n",
dev);
fprintf(stderr, "\n");
return -dev;
}
if (dev > deviceCount - 1) {
fprintf(stderr, "\n");
fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
fprintf(stderr, "\n");
return -dev;
}
checkCudaErrors(cuDeviceGet(&cuDevice, dev));
char name[100];
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
checkCudaErrors(cuDeviceGet(&cuDevice, dev));
char name[100];
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
if (computeMode == CU_COMPUTEMODE_PROHIBITED) {
fprintf(stderr,
"Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no "
"threads can use this CUDA Device.\n");
return -1;
}
if (computeMode == CU_COMPUTEMODE_PROHIBITED) {
fprintf(stderr,
"Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no "
"threads can use this CUDA Device.\n");
return -1;
}
if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) {
printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
}
if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) {
printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
}
return dev;
return dev;
}
// This function returns the best GPU based on performance
inline int gpuGetMaxGflopsDeviceIdDRV() {
CUdevice current_device = 0;
CUdevice max_perf_device = 0;
int device_count = 0;
int sm_per_multiproc = 0;
unsigned long long max_compute_perf = 0;
int major = 0;
int minor = 0;
int multiProcessorCount;
int clockRate;
int devices_prohibited = 0;
inline int gpuGetMaxGflopsDeviceIdDRV()
{
CUdevice current_device = 0;
CUdevice max_perf_device = 0;
int device_count = 0;
int sm_per_multiproc = 0;
unsigned long long max_compute_perf = 0;
int major = 0;
int minor = 0;
int multiProcessorCount;
int clockRate;
int devices_prohibited = 0;
cuInit(0, __CUDA_API_VERSION);
checkCudaErrors(cuDeviceGetCount(&device_count));
cuInit(0, __CUDA_API_VERSION);
checkCudaErrors(cuDeviceGetCount(&device_count));
if (device_count == 0) {
fprintf(stderr,
"gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
exit(EXIT_FAILURE);
}
// Find the best CUDA capable GPU device
current_device = 0;
while (current_device < device_count) {
checkCudaErrors(cuDeviceGetAttribute(
&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
current_device));
checkCudaErrors(cuDeviceGetAttribute(
&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,
current_device);
if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
if (major == 9999 && minor == 9999) {
sm_per_multiproc = 1;
} else {
sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
}
unsigned long long compute_perf =
(unsigned long long)(multiProcessorCount * sm_per_multiproc *
clockRate);
if (compute_perf > max_compute_perf) {
max_compute_perf = compute_perf;
max_perf_device = current_device;
}
} else {
devices_prohibited++;
if (device_count == 0) {
fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
exit(EXIT_FAILURE);
}
++current_device;
}
// Find the best CUDA capable GPU device
current_device = 0;
if (devices_prohibited == device_count) {
fprintf(stderr,
"gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode "
"prohibited.\n");
exit(EXIT_FAILURE);
}
while (current_device < device_count) {
checkCudaErrors(
cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device));
checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
return max_perf_device;
int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
if (major == 9999 && minor == 9999) {
sm_per_multiproc = 1;
}
else {
sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
}
unsigned long long compute_perf = (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);
if (compute_perf > max_compute_perf) {
max_compute_perf = compute_perf;
max_perf_device = current_device;
}
}
else {
devices_prohibited++;
}
++current_device;
}
if (devices_prohibited == device_count) {
fprintf(stderr,
"gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode "
"prohibited.\n");
exit(EXIT_FAILURE);
}
return max_perf_device;
}
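The selection metric computed above is a coarse proxy, not measured throughput: SM count times the per-SM core count from _ConvertSMVer2CoresDRV times the clock attribute (reported in kHz). A hypothetical worked example:

// A hypothetical SM 8.6 device with 28 SMs at 1,700,000 kHz scores
// 28 * 128 * 1700000 ~= 6.1e9; the device with the largest score wins.
static unsigned long long perfScore(int smCount, int coresPerSM, int clockKHz)
{
    return (unsigned long long)smCount * coresPerSM * clockKHz;
}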
// General initialization call to pick the best CUDA Device
inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
CUdevice cuDevice;
int devID = 0;
inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
{
CUdevice cuDevice;
int devID = 0;
// If the command-line has a device number specified, use it
if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
devID = gpuDeviceInitDRV(argc, argv);
// If the command-line has a device number specified, use it
if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
devID = gpuDeviceInitDRV(argc, argv);
if (devID < 0) {
printf("exiting...\n");
exit(EXIT_SUCCESS);
if (devID < 0) {
printf("exiting...\n");
exit(EXIT_SUCCESS);
}
}
else {
// Otherwise pick the device with highest Gflops/s
char name[100];
devID = gpuGetMaxGflopsDeviceIdDRV();
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
cuDeviceGetName(name, 100, cuDevice);
printf("> Using CUDA Device [%d]: %s\n", devID, name);
}
} else {
// Otherwise pick the device with highest Gflops/s
char name[100];
devID = gpuGetMaxGflopsDeviceIdDRV();
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
cuDeviceGetName(name, 100, cuDevice);
printf("> Using CUDA Device [%d]: %s\n", devID, name);
}
cuDeviceGet(&cuDevice, devID);
cuDeviceGet(&cuDevice, devID);
return cuDevice;
return cuDevice;
}
inline CUdevice findIntegratedGPUDrv() {
CUdevice current_device = 0;
int device_count = 0;
int devices_prohibited = 0;
int isIntegrated;
inline CUdevice findIntegratedGPUDrv()
{
CUdevice current_device = 0;
int device_count = 0;
int devices_prohibited = 0;
int isIntegrated;
cuInit(0, __CUDA_API_VERSION);
checkCudaErrors(cuDeviceGetCount(&device_count));
cuInit(0, __CUDA_API_VERSION);
checkCudaErrors(cuDeviceGetCount(&device_count));
if (device_count == 0) {
fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
exit(EXIT_FAILURE);
}
// Find the integrated GPU which is compute capable
while (current_device < device_count) {
int computeMode = -1;
checkCudaErrors(cuDeviceGetAttribute(
&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
// If GPU is integrated and is not running on Compute Mode prohibited use
// that
if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
int major = 0, minor = 0;
char deviceName[256];
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
current_device));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
current_device));
checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
current_device, deviceName, major, minor);
return current_device;
} else {
devices_prohibited++;
if (device_count == 0) {
fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
exit(EXIT_FAILURE);
}
current_device++;
}
// Find the integrated GPU which is compute capable
while (current_device < device_count) {
int computeMode = -1;
checkCudaErrors(cuDeviceGetAttribute(&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
checkCudaErrors(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
if (devices_prohibited == device_count) {
fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n");
exit(EXIT_FAILURE);
}
// If GPU is integrated and is not running on Compute Mode prohibited use
// that
if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
int major = 0, minor = 0;
char deviceName[256];
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device, deviceName, major, minor);
return -1;
return current_device;
}
else {
devices_prohibited++;
}
current_device++;
}
if (devices_prohibited == device_count) {
fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n");
exit(EXIT_FAILURE);
}
return -1;
}
// General check for CUDA GPU SM Capabilities
inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
{
    CUdevice cuDevice;
    char     name[256];
    int      major = 0, minor = 0;
    checkCudaErrors(cuDeviceGet(&cuDevice, devID));
    checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    if ((major > major_version) || (major == major_version && minor >= minor_version)) {
        printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
        return true;
    }
    else {
        printf("No GPU device was found that can support CUDA compute capability "
               "%d.%d.\n",
               major_version,
               minor_version);
        return false;
    }
}
#endif
// end of CUDA Helper Functions
#endif // HELPER_CUDA_DRVAPI_H
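For orientation, a minimal sketch of a call site for this guard, as driver-API samples typically use it (illustrative only; EXIT_WAIVED is the samples' usual convention for skipping an unsupported configuration rather than failing):

    // Hypothetical call site: require at least SM 7.0 on device 0.
    checkCudaErrors(cuInit(0));
    if (!checkCudaCapabilitiesDRV(7, 0, 0)) {
        exit(EXIT_WAIVED); // waive the test instead of reporting a failure
    }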

View File

@ -34,8 +34,8 @@
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA // Matrix B height
#define WC WB // Matrix C width
#define HC HA // Matrix C height
#endif // _MATRIXMUL_H_
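As a quick sanity check of these sizes, assuming the block_size of 32 that matrixMulDynlinkJIT.cpp sets below:

    // WA = 4 * 32 = 128, HA = 6 * 32 = 192  -> A is 192 rows x 128 cols
    // WB = 4 * 32 = 128, HB = WA  = 128     -> B is 128 rows x 128 cols
    // WC = WB     = 128, HC = HA  = 192     -> C is 192 rows x 128 cols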

View File

@ -43,10 +43,10 @@
*/
// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes, CUDA
#include "cuda_drvapi_dynlink.h"
@ -60,7 +60,7 @@
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
#if defined _MSC_VER
#pragma warning(disable : 4312)
#endif
@ -68,7 +68,7 @@ extern "C" void computeGold(float *, const float *, const float *, unsigned int,
// Globals
////////////////////////////////////////////////////////////////////////////////
CUcontext g_cuContext;
bool noprompt = false;
static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
@ -78,8 +78,7 @@ static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
////////////////////////////////////////////////////////////////////////////////
void randomInit(float *data, size_t size)
{
for (size_t i = 0; i < size; ++i) {
data[i] = rand() / (float)RAND_MAX;
}
}
@ -89,33 +88,29 @@ void randomInit(float *data, size_t size)
////////////////////////////////////////////////////////////////////////////////
CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size_out)
{
CUresult status;
CUdevice cuDevice;
CUmodule cuModule;
CUfunction cuFunction;
int major, minor, block_size, devID = 0;
char deviceName[256];
// link to cuda driver dynamically
checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
// This assumes that the user is attempting to specify an explicit device -device=n
    if (argc > 1) {
        bool bFound = false;
        for (int param = 0; param < argc; param++) {
            if (!strncmp(argv[param], "-device", 7)) {
                int i = (int)strlen(argv[1]);
                while (argv[1][i] != '=') {
                    i--;
                }
                devID  = atoi(&argv[1][++i]);
                bFound = true;
            }
@ -128,16 +123,15 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
int deviceCount = 0;
checkCudaErrors(cuDeviceGetCount(&deviceCount));
    if (deviceCount == 0) {
        fprintf(stderr, "No devices supporting CUDA detected, exiting...\n");
        exit(EXIT_SUCCESS);
    }
    if (devID < 0)
        devID = 0;
    if (devID > deviceCount - 1) {
fprintf(stderr, "initCUDA (Device=%d) invalid GPU device. %d GPU device(s) detected.\n\n", devID, deviceCount);
status = CUDA_ERROR_NOT_FOUND;
@ -153,14 +147,13 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice));
printf("> Device %d: \"%s\" with Compute %d.%d capability\n", cuDevice, deviceName, major, minor);
block_size = 32;
*block_size_out = block_size;
// create context for picked device
status = cuCtxCreate(&g_cuContext, 0, cuDevice);
if (CUDA_SUCCESS != status) {
cuCtxDestroy(g_cuContext);
exit(EXIT_SUCCESS);
}
@ -169,53 +162,53 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
{
// in this branch we use compilation with parameters
const unsigned int jitNumOptions = 3;
        CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
        void        **jitOptVals = new void *[jitNumOptions];
        // set up size of compilation log buffer
        jitOptions[0]        = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        int jitLogBufferSize = 1024;
        jitOptVals[0]        = (void *)(size_t)jitLogBufferSize;
        // set up pointer to the compilation log buffer
        jitOptions[1]      = CU_JIT_INFO_LOG_BUFFER;
        char *jitLogBuffer = new char[jitLogBufferSize];
        jitOptVals[1]      = jitLogBuffer;
        // set up pointer to set the Maximum # of registers for a particular kernel
        jitOptions[2]   = CU_JIT_MAX_REGISTERS;
        int jitRegCount = 32;
        jitOptVals[2]   = (void *)(size_t)jitRegCount;
// compile with set parameters
printf("> Compiling CUDA module\n");
#if defined(_WIN64) || defined(__LP64__)
    status =
        cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
#else
    status =
        cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
#endif
printf("> PTX JIT log:\n%s\n", jitLogBuffer);
delete[] jitOptions;
delete[] jitOptVals;
delete[] jitLogBuffer;
}
if (CUDA_SUCCESS != status) {
printf("Error while compiling PTX\n");
cuCtxDestroy(g_cuContext);
exit(EXIT_FAILURE);
}
// retrieve CUDA function from the compiled module
status = cuModuleGetFunction(
&cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
if (CUDA_SUCCESS != status) {
cuCtxDestroy(g_cuContext);
exit(EXIT_FAILURE);
}
@ -233,21 +226,21 @@ int main(int argc, char **argv)
printf("[ %s ]\n", sSDKsample);
// initialize CUDA
CUfunction matrixMul = NULL;
int block_size = 0;
checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size));
// set seed for rand()
srand(2006);
// allocate host memory for matrices A and B
size_t size_A = WA * HA;
size_t mem_size_A = sizeof(float) * size_A;
size_t size_B = WB * HB;
size_t mem_size_B = sizeof(float) * size_B;
float *h_A = (float *)malloc(mem_size_A);
float *h_B = (float *)malloc(mem_size_B);
// initialize host memory
randomInit(h_A, size_A);
@ -264,26 +257,24 @@ int main(int argc, char **argv)
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// allocate device memory for result
size_t size_C = WC * HC;
size_t mem_size_C = sizeof(float) * size_C;
CUdeviceptr d_C;
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
// allocate mem for the result on host side
float *h_C = (float *)malloc(mem_size_C);
#if __CUDA_API_VERSION >= 4000
{
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
int Matrix_Width_A = WA;
int Matrix_Width_B = WB;
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
checkCudaErrors(cuLaunchKernel(
matrixMul, (WC / block_size), (HC / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
}
#else // __CUDA_API_VERSION <= 3020
{
@ -312,7 +303,7 @@ int main(int argc, char **argv)
checkCudaErrors(cuParamSetSize(matrixMul, offset));
checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1));
checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2 * block_size * block_size * sizeof(float)));
// set execution configuration for the CUDA kernel
checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size));
@ -322,19 +313,18 @@ int main(int argc, char **argv)
checkCudaErrors(cuCtxSynchronize());
// copy result from device to host
checkCudaErrors(cuMemcpyDtoH((void *)h_C, d_C, mem_size_C));
// compute reference solution
float *reference = (float *)malloc(mem_size_C);
computeGold(reference, h_A, h_B, HA, WA, WB);
// check result
float diff = 0.0f;
for (unsigned int i = 0; i < size_C; i++) {
float tmp = reference[i] - h_C[i];
diff += tmp * tmp;
}
int res = (diff / (float)size_C < 1e-6f);
@ -349,7 +339,7 @@ int main(int argc, char **argv)
checkCudaErrors(cuMemFree(d_C));
checkCudaErrors(cuCtxDestroy(g_cuContext));
printf("Test run %s\n", (1==res) ? "success!" : "failed!");
printf("Test run %s\n", (1 == res) ? "success!" : "failed!");
exit((1 == res) ? EXIT_SUCCESS : EXIT_FAILURE);
}

View File

@ -28,8 +28,7 @@
////////////////////////////////////////////////////////////////////////////////
// export C interface
extern "C"
void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
@ -40,16 +39,13 @@ void computeGold(float *, const float *, const float *, unsigned int, unsigned i
//! @param hA height of matrix A
//! @param wB width of matrix B
////////////////////////////////////////////////////////////////////////////////
void computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{
for (unsigned int i = 0; i < hA; ++i)
for (unsigned int j = 0; j < wB; ++j) {
double sum = 0;
for (unsigned int k = 0; k < wA; ++k) {
double a = A[i * wA + k];
double b = B[k * wB + j];
sum += a * b;

View File

@ -32,7 +32,8 @@
#define __matrixMul_kernel_32_ptxdump_h__
#if defined __cplusplus
extern "C" {
extern "C"
{
#endif
extern unsigned char matrixMul_kernel_32_ptxdump[25784];

View File

@ -32,7 +32,8 @@
#define __matrixMul_kernel_64_ptxdump_h__
#if defined __cplusplus
extern "C" {
extern "C"
{
#endif
extern unsigned char matrixMul_kernel_64_ptxdump[26489];

View File

@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
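A usage note (the option name comes from the block above; the build directory is illustrative): configure with

    cmake -DENABLE_CUDA_DEBUG=ON ..
    cmake --build .

to compile device code with -G for cuda-gdb; without the flag, -lineinfo is added instead so debug tools still get source-line mapping.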
# Include directories and libraries

View File

@ -2,7 +2,7 @@
## Description
This sample implements matrix multiplication and is exactly the same as the second example of the [Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) section of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the CUDA 4.0+ interface for cuBLAS to demonstrate high-performance matrix multiplication.
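For readers who have not used that interface, a minimal sketch of a cuBLAS v2 SGEMM call computing row-major C = A x B (illustrative only, not this sample's actual host code; d_A/d_B/d_C are device buffers and hA/wA/wB the matrix dimensions):

    #include <cublas_v2.h>

    cublasHandle_t handle;
    cublasCreate(&handle);
    const float alpha = 1.0f, beta = 0.0f;
    // cuBLAS is column-major, so compute C^T = B^T * A^T to obtain row-major C = A * B.
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, wB, hA, wA, &alpha, d_B, wB, d_A, wA, &beta, d_C, wB);
    cublasDestroy(handle);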
## Key Concepts
@ -30,7 +30,7 @@ cuMemcpyDtoH, cuLaunchKernel, cuMemcpyHtoD, cuCtxSynchronize, cuMemAlloc, cuMemF
## Prerequisites
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details)

View File

@ -42,207 +42,208 @@
*/
// System includes
#include <assert.h>
#include <stdio.h>
// CUDA runtime
#include <cuda_runtime.h>
#include "nvrtc_helper.h"
// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
void constantInit(float *data, int size, float val)
{
for (int i = 0; i < size; ++i) {
data[i] = val;
}
}
/**
* Run a simple test of matrix multiplication using CUDA
*/
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
// Allocate host memory for matrices A and B
unsigned int size_A = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = dimsB.x * dimsB.y;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B);
// Initialize host memory
const float valB = 0.01f;
constantInit(h_A, size_A, 1.0f);
constantInit(h_B, size_B, valB);
// Allocate device memory
CUdeviceptr d_A, d_B, d_C;
    char  *cubin, *kernel_file;
    size_t cubinSize;
    kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1);
    CUmodule module = loadCUBIN(cubin, argc, argv);
// Allocate host matrix C
dim3 dimsC(dimsB.x, dimsA.y, 1);
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
float *h_C = (float *)malloc(mem_size_C);
    if (h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }
    checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
    checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
    checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
    // copy host memory to device
    checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
    checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
    // Setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");
CUfunction kernel_addr;
if (block_size == 16) {
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
}
else {
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
}
void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x};
// Execute the kernel
int nIter = 300;
for (int j = 0; j < nIter; j++) {
checkCudaErrors(cuLaunchKernel(kernel_addr,
grid.x,
grid.y,
grid.z, /* grid dim */
threads.x,
threads.y,
threads.z, /* block dim */
0,
0, /* shared mem, stream */
&arr[0], /* arguments */
0));
checkCudaErrors(cuCtxSynchronize());
}
// Copy result from device to host
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));
printf("Checking computed result for correctness: ");
bool correct = true;
// test relative error by the formula
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
double eps = 1.e-6; // machine zero
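    // Worked instance with the defaults below (block_size = 32, so dimsA.x = 5 * 2 * 32 = 320,
    // and valB = 0.01f): every element of C should equal 320 * 0.01f = 3.2, making the test
    // rel_err = |h_C[i] - 3.2| / (|h_C[i]| * 320) < 1e-6.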
for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
double abs_err = fabs(h_C[i] - (dimsA.x * valB));
double dot_length = dimsA.x;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
// Clean up memory
free(h_A);
free(h_B);
free(h_C);
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
if (correct) {
return EXIT_SUCCESS;
}
else {
return EXIT_FAILURE;
}
}
/**
* Program main
*/
int main(int argc, char **argv)
{
printf("[Matrix Multiply Using CUDA] - Starting...\n");
if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
checkCmdLineFlag(argc, (const char **)argv, "?")) {
printf("Usage -device=n (n >= 0 for deviceID)\n");
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
printf(
" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
printf("Usage -device=n (n >= 0 for deviceID)\n");
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
exit(EXIT_SUCCESS);
}
exit(EXIT_SUCCESS);
}
int block_size = 32;
// original:
dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
// reduce sizes to avoid running out of memory
// dim3 dimsA(32,32, 1);
// dim3 dimsB(32,32,1);
// width of Matrix A
if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
}
// height of Matrix A
if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
}
// width of Matrix B
if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
}
// height of Matrix B
if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
}
if (dimsA.x != dimsB.y) {
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
exit(EXIT_FAILURE);
}
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x,
dimsB.y);
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
exit(matrix_result);
}
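A hypothetical invocation, assuming the built binary keeps the sample's directory name:

    ./matrixMul_nvrtc -wA=320 -hA=320 -wB=640 -hB=320

Any sizes work as long as wA equals hB (checked above) and each dimension is a multiple of the 32 x 32 thread block, since the grid is computed by integer division.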

View File

@ -48,84 +48,83 @@
#include <cooperative_groups.h>
template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
// Handle to thread block group
cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block
int aBegin = wA * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * wB;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
// Synchronize to make sure the matrices are loaded
cooperative_groups::sync(cta);
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx];
        }
        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        cooperative_groups::sync(cta);
    }
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
}
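// Sizing note: at BLOCK_SIZE = 32 the tiles As and Bs together occupy
// 2 * 32 * 32 * sizeof(float) = 8192 bytes of shared memory per block,
// and each of the 32 * 32 = 1024 threads computes exactly one element of C.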
extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B,
int wA, int wB) {
matrixMulCUDA<16>(C, A, B, wA, wB);
extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
{
matrixMulCUDA<16>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B,
int wA, int wB) {
matrixMulCUDA<32>(C, A, B, wA, wB);
extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
{
matrixMulCUDA<32>(C, A, B, wA, wB);
}

View File

@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
if(ENABLE_CUDA_DEBUG)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
else()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@ -27,6 +27,6 @@ cudaMalloc, cudaDeviceSynchronize, cudaMemcpy, cudaFree
## Prerequisites
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details)

View File

@ -28,252 +28,254 @@
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <assert.h>
#include <helper_cuda.h>
#include "mergeSort_common.h"
inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB, uint arrowDir)
{
    uint t;
    if ((keyA > keyB) == arrowDir) {
        t    = keyA;
        keyA = keyB;
        keyB = t;
        t    = valA;
        valA = valB;
        valB = t;
    }
}
__global__ void
bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint sortDir)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    // Shared memory storage for one or more short vectors
    __shared__ uint s_key[SHARED_SIZE_LIMIT];
    __shared__ uint s_val[SHARED_SIZE_LIMIT];
    // Offset to the beginning of subbatch and load data
    d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    s_key[threadIdx.x + 0] = d_SrcKey[0];
    s_val[threadIdx.x + 0] = d_SrcVal[0];
    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
    for (uint size = 2; size < arrayLength; size <<= 1) {
        // Bitonic merge
        uint dir = (threadIdx.x & (size / 2)) != 0;
        for (uint stride = size / 2; stride > 0; stride >>= 1) {
            cg::sync(cta);
            uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir);
        }
    }
    // ddd == sortDir for the last bitonic merge step
    {
        for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
            cg::sync(cta);
            uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], sortDir);
        }
    }
cg::sync(cta);
d_DstKey[0] = s_key[threadIdx.x + 0];
d_DstVal[0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
// Helper function (also used by odd-even merge sort)
extern "C" uint factorRadix2(uint *log2L, uint L)
{
    if (!L) {
        *log2L = 0;
        return 0;
    }
    else {
        // note: ++*log2L (not *log2L++) so the count, not the pointer, is incremented
        for (*log2L = 0; (L & 1) == 0; L >>= 1, ++*log2L)
            ;
        return L;
    }
}
extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
uint *d_SrcKey, uint *d_SrcVal,
uint batchSize, uint arrayLength,
uint sortDir) {
// Nothing to sort
if (arrayLength < 2) {
return;
}
extern "C" void bitonicSortShared(uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint batchSize,
uint arrayLength,
uint sortDir)
{
// Nothing to sort
if (arrayLength < 2) {
return;
}
// Only power-of-two array lengths are supported by this implementation
uint log2L;
uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
assert(factorizationRemainder == 1);
// Only power-of-two array lengths are supported by this implementation
uint log2L;
uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
assert(factorizationRemainder == 1);
uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
uint threadCount = SHARED_SIZE_LIMIT / 2;
uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
uint threadCount = SHARED_SIZE_LIMIT / 2;
assert(arrayLength <= SHARED_SIZE_LIMIT);
assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);
assert(arrayLength <= SHARED_SIZE_LIMIT);
assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);
bitonicSortSharedKernel<<<blockCount, threadCount>>>(
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
bitonicSortSharedKernel<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }
static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
template <uint sortDir>
static inline __device__ void
ComparatorExtended(uint &keyA, uint &valA, uint &flagA, uint &keyB, uint &valB, uint &flagB, uint arrowDir)
{
    uint t;
    if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || ((arrowDir == sortDir) && (flagA == 1))
        || ((arrowDir != sortDir) && (flagB == 1))) {
        t     = keyA;
        keyA  = keyB;
        keyB  = t;
        t     = valA;
        valA  = valB;
        valB  = t;
        t     = flagA;
        flagA = flagB;
        flagB = t;
    }
}
template <uint sortDir>
__global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
                                                      uint *d_DstVal,
                                                      uint *d_SrcKey,
                                                      uint *d_SrcVal,
                                                      uint *d_LimitsA,
                                                      uint *d_LimitsB,
                                                      uint  stride,
                                                      uint  N)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[2 * SAMPLE_STRIDE];
    __shared__ uint s_val[2 * SAMPLE_STRIDE];
    __shared__ uint s_inf[2 * SAMPLE_STRIDE];
    const uint intervalI   = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
    const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
    d_SrcKey += segmentBase;
    d_SrcVal += segmentBase;
    d_DstKey += segmentBase;
    d_DstVal += segmentBase;
    // Set up threadblock-wide parameters
    __shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst;
    if (threadIdx.x == 0) {
        uint segmentElementsA = stride;
        uint segmentElementsB = umin(stride, N - segmentBase - stride);
        uint segmentSamplesA  = stride / SAMPLE_STRIDE;
        uint segmentSamplesB  = getSampleCount(segmentElementsB);
        uint segmentSamples   = segmentSamplesA + segmentSamplesB;
        startSrcA = d_LimitsA[blockIdx.x];
        startSrcB = d_LimitsB[blockIdx.x];
        startDst  = startSrcA + startSrcB;
        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
        lenSrcA      = endSrcA - startSrcA;
        lenSrcB      = endSrcB - startSrcB;
    }
    s_inf[threadIdx.x + 0] = 1;
    s_inf[threadIdx.x + SAMPLE_STRIDE] = 1;
    // Load input data
    cg::sync(cta);
    if (threadIdx.x < lenSrcA) {
        s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x];
        s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x];
        s_inf[threadIdx.x] = 0;
    }
    // Prepare for bitonic merge by inversing the ordering
    if (threadIdx.x < lenSrcB) {
        s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcKey[stride + startSrcB + threadIdx.x];
        s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcVal[stride + startSrcB + threadIdx.x];
        s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
    }
    //"Extended" bitonic merge
    for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
        cg::sync(cta);
        uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
        ComparatorExtended<sortDir>(s_key[pos + 0],
                                    s_val[pos + 0],
                                    s_inf[pos + 0],
                                    s_key[pos + stride],
                                    s_val[pos + stride],
                                    s_inf[pos + stride],
                                    sortDir);
    }
    // Store sorted data
    cg::sync(cta);
    d_DstKey += startDst;
    d_DstVal += startDst;
    if (threadIdx.x < lenSrcA) {
        d_DstKey[threadIdx.x] = s_key[threadIdx.x];
        d_DstVal[threadIdx.x] = s_val[threadIdx.x];
    }
    if (threadIdx.x < lenSrcB) {
        d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
        d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
    }
}
extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
                                                uint *d_DstVal,
                                                uint *d_SrcKey,
                                                uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB,
                                                uint  stride,
                                                uint  N,
                                                uint  sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
    if (sortDir) {
        bitonicMergeElementaryIntervalsKernel<1U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
    }
    else {
        bitonicMergeElementaryIntervalsKernel<0U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
    }
}

View File

@ -26,96 +26,94 @@
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include "mergeSort_common.h"
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
StopWatchInterface *hTimer = NULL;
const uint N = 4 * 1048576;
const uint DIR = 1;
const uint numValues = 65536;
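    // i.e. N = 4 * 2^20 = 4,194,304 key/value pairs (16 MB per uint array);
    // keys are drawn from [0, numValues), so duplicate keys are guaranteed.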
printf("%s Starting...\n\n", argv[0]);
printf("%s Starting...\n\n", argv[0]);
int dev = findCudaDevice(argc, (const char **)argv);
if (dev == -1) {
return EXIT_FAILURE;
}
printf("Allocating and initializing host arrays...\n\n");
sdkCreateTimer(&hTimer);
h_SrcKey = (uint *)malloc(N * sizeof(uint));
h_SrcVal = (uint *)malloc(N * sizeof(uint));
h_DstKey = (uint *)malloc(N * sizeof(uint));
h_DstVal = (uint *)malloc(N * sizeof(uint));
printf("Allocating and initializing host arrays...\n\n");
sdkCreateTimer(&hTimer);
h_SrcKey = (uint *)malloc(N * sizeof(uint));
h_SrcVal = (uint *)malloc(N * sizeof(uint));
h_DstKey = (uint *)malloc(N * sizeof(uint));
h_DstVal = (uint *)malloc(N * sizeof(uint));
srand(2009);
for (uint i = 0; i < N; i++) {
h_SrcKey[i] = rand() % numValues;
}
fillValues(h_SrcVal, N);
printf("Allocating and initializing CUDA arrays...\n\n");
checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
checkCudaErrors(
cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
checkCudaErrors(
cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));
printf("Allocating and initializing CUDA arrays...\n\n");
checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));
printf("Initializing GPU merge sort...\n");
initMergeSort();
printf("Initializing GPU merge sort...\n");
initMergeSort();
printf("Running GPU merge sort...\n");
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&hTimer);
printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));
printf("Running GPU merge sort...\n");
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&hTimer);
printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));
printf("Reading back GPU merge sort results...\n");
checkCudaErrors(
cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
checkCudaErrors(
cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));
printf("Reading back GPU merge sort results...\n");
checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));
printf("Inspecting the results...\n");
uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);
printf("Inspecting the results...\n");
uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);
uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N);
printf("Shutting down...\n");
closeMergeSort();
sdkDeleteTimer(&hTimer);
checkCudaErrors(cudaFree(d_SrcVal));
checkCudaErrors(cudaFree(d_SrcKey));
checkCudaErrors(cudaFree(d_BufVal));
checkCudaErrors(cudaFree(d_BufKey));
checkCudaErrors(cudaFree(d_DstVal));
checkCudaErrors(cudaFree(d_DstKey));
free(h_DstVal);
free(h_DstKey);
free(h_SrcVal);
free(h_SrcKey);
printf("Shutting down...\n");
closeMergeSort();
sdkDeleteTimer(&hTimer);
checkCudaErrors(cudaFree(d_SrcVal));
checkCudaErrors(cudaFree(d_SrcKey));
checkCudaErrors(cudaFree(d_BufVal));
checkCudaErrors(cudaFree(d_BufKey));
checkCudaErrors(cudaFree(d_DstVal));
checkCudaErrors(cudaFree(d_DstKey));
free(h_DstVal);
free(h_DstKey);
free(h_SrcVal);
free(h_SrcKey);
exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
}

View File

@ -39,491 +39,499 @@
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include "mergeSort_common.h"
////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }
static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
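// e.g. iDivUp(10, 4) == 3: ceiling division, used by getSampleCount() to round
// an element count up to a whole number of SAMPLE_STRIDE-sized samples.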
#define W (sizeof(uint) * 8)
static inline __device__ uint nextPowerOfTwo(uint x)
{
/*
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
*/
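    // __clz counts leading zeros, so e.g. x = 33 gives __clz(32) == 26 and
    // 1U << (32 - 26) == 64; exact powers of two map to themselves.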
return 1U << (W - __clz(x - 1));
}
template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }
    uint pos = 0;
    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);
        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }
    return pos;
}
template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }
    uint pos = 0;
    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);
        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }
    return pos;
}
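// The two searches differ only in the comparison: "Inclusive" counts keys
// <= val (>= val when descending), "Exclusive" counts keys strictly < val
// (> val). Using Exclusive for one run and Inclusive for the other gives
// equal keys from the two runs distinct, non-colliding output positions.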
////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[SHARED_SIZE_LIMIT];
    __shared__ uint s_val[SHARED_SIZE_LIMIT];
    d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    s_key[threadIdx.x + 0] = d_SrcKey[0];
    s_val[threadIdx.x + 0] = d_SrcVal[0];
    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
    for (uint stride = 1; stride < arrayLength; stride <<= 1) {
        uint  lPos    = threadIdx.x & (stride - 1);
        uint *baseKey = s_key + 2 * (threadIdx.x - lPos);
        uint *baseVal = s_val + 2 * (threadIdx.x - lPos);
        cg::sync(cta);
        uint keyA = baseKey[lPos + 0];
        uint valA = baseVal[lPos + 0];
        uint keyB = baseKey[lPos + stride];
        uint valB = baseVal[lPos + stride];
        uint posA = binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + lPos;
        uint posB = binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + lPos;
        cg::sync(cta);
        baseKey[posA] = keyA;
        baseVal[posA] = valA;
        baseKey[posB] = keyB;
        baseVal[posB] = valB;
    }
    cg::sync(cta);
    d_DstKey[0] = s_key[threadIdx.x + 0];
    d_DstVal[0] = s_val[threadIdx.x + 0];
    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
static void mergeSortShared(uint *d_DstKey,
                            uint *d_DstVal,
                            uint *d_SrcKey,
                            uint *d_SrcVal,
                            uint  batchSize,
                            uint  arrayLength,
                            uint  sortDir)
{
    if (arrayLength < 2) {
        return;
    }
    assert(SHARED_SIZE_LIMIT % arrayLength == 0);
    assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0);
    uint blockCount  = batchSize * arrayLength / SHARED_SIZE_LIMIT;
    uint threadCount = SHARED_SIZE_LIMIT / 2;
    if (sortDir) {
        mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<1><<<>>> failed\n");
    }
    else {
        mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<0><<<>>> failed\n");
    }
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void
generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;
    if (pos >= threadCount) {
        return;
    }
    const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    d_SrcKey += segmentBase;
    d_RanksA += segmentBase / SAMPLE_STRIDE;
    d_RanksB += segmentBase / SAMPLE_STRIDE;
    const uint segmentElementsA = stride;
    const uint segmentElementsB = umin(stride, N - segmentBase - stride);
    const uint segmentSamplesA = getSampleCount(segmentElementsA);
    const uint segmentSamplesB = getSampleCount(segmentElementsB);
    if (i < segmentSamplesA) {
        d_RanksA[i] = i * SAMPLE_STRIDE;
        d_RanksB[i] = binarySearchExclusive<sortDir>(
            d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, nextPowerOfTwo(segmentElementsB));
    }
    if (i < segmentSamplesB) {
        d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
        d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
            d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, nextPowerOfTwo(segmentElementsA));
    }
}
static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
    if (sortDir) {
        generateSampleRanksKernel<1U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
    }
    else {
        generateSampleRanksKernel<0U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
    }
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;
    if (pos >= threadCount) {
        return;
    }
    const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    d_Ranks += (pos - i) * 2;
    d_Limits += (pos - i) * 2;
    const uint segmentElementsA = stride;
    const uint segmentElementsB = umin(stride, N - segmentBase - stride);
    const uint segmentSamplesA = getSampleCount(segmentElementsA);
    const uint segmentSamplesB = getSampleCount(segmentElementsB);
    if (i < segmentSamplesA) {
        uint dstPos = binarySearchExclusive<1U>(
                          d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB))
                      + i;
        d_Limits[dstPos] = d_Ranks[i];
    }
    if (i < segmentSamplesB) {
        uint dstPos = binarySearchInclusive<1U>(
                          d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA))
                      + i;
        d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
    }
}
static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, uint *d_RanksA, uint *d_RanksB, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsA, d_RanksA, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");
    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsB, d_RanksB, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
inline __device__ void merge(uint *dstKey,
                             uint *dstVal,
                             uint *srcAKey,
                             uint *srcAVal,
                             uint *srcBKey,
                             uint *srcBVal,
                             uint lenA,
                             uint nPowTwoLenA,
                             uint lenB,
                             uint nPowTwoLenB,
                             cg::thread_block cta)
{
    uint keyA, valA, keyB, valB, dstPosA, dstPosB;
    if (threadIdx.x < lenA) {
        keyA = srcAKey[threadIdx.x];
        valA = srcAVal[threadIdx.x];
        dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + threadIdx.x;
    }
    if (threadIdx.x < lenB) {
        keyB = srcBKey[threadIdx.x];
        valB = srcBVal[threadIdx.x];
        dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + threadIdx.x;
    }
    cg::sync(cta);
    if (threadIdx.x < lenA) {
        dstKey[dstPosA] = keyA;
        dstVal[dstPosA] = valA;
    }
    if (threadIdx.x < lenB) {
        dstKey[dstPosB] = keyB;
        dstVal[dstPosB] = valB;
    }
}
template <uint sortDir>
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,
                                               uint *d_DstVal,
                                               uint *d_SrcKey,
                                               uint *d_SrcVal,
                                               uint *d_LimitsA,
                                               uint *d_LimitsB,
                                               uint stride,
                                               uint N)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[2 * SAMPLE_STRIDE];
    __shared__ uint s_val[2 * SAMPLE_STRIDE];
    const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
    const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
    d_SrcKey += segmentBase;
    d_SrcVal += segmentBase;
    d_DstKey += segmentBase;
    d_DstVal += segmentBase;
    // Set up threadblock-wide parameters
    __shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;
    if (threadIdx.x == 0) {
        uint segmentElementsA = stride;
        uint segmentElementsB = umin(stride, N - segmentBase - stride);
        uint segmentSamplesA = getSampleCount(segmentElementsA);
        uint segmentSamplesB = getSampleCount(segmentElementsB);
        uint segmentSamples = segmentSamplesA + segmentSamplesB;
        startSrcA = d_LimitsA[blockIdx.x];
        startSrcB = d_LimitsB[blockIdx.x];
        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
        lenSrcA = endSrcA - startSrcA;
        lenSrcB = endSrcB - startSrcB;
        startDstA = startSrcA + startSrcB;
        startDstB = startDstA + lenSrcA;
    }
// Load main input data
cg::sync(cta);
if (threadIdx.x < lenSrcA) {
s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x];
s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x];
}
if (threadIdx.x < lenSrcB) {
s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x];
s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x];
}
// Merge data in shared memory
cg::sync(cta);
merge<sortDir>(s_key,
s_val,
s_key + 0,
s_val + 0,
s_key + SAMPLE_STRIDE,
s_val + SAMPLE_STRIDE,
lenSrcA,
SAMPLE_STRIDE,
lenSrcB,
SAMPLE_STRIDE,
cta);
// Store merged data
cg::sync(cta);
if (threadIdx.x < lenSrcA) {
d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x];
d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x];
}
if (threadIdx.x < lenSrcB) {
d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
}
}
static void mergeElementaryIntervals(uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB,
uint stride,
uint N,
uint sortDir)
{
uint lastSegmentElements = N % (2 * stride);
uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
if (sortDir) {
mergeElementaryIntervalsKernel<1U>
<<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
}
else {
mergeElementaryIntervalsKernel<0U>
<<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
}
}
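
One detail worth calling out in the launch above: the grid has one block per elementary interval, each block has SAMPLE_STRIDE threads, and each interval contributes at most SAMPLE_STRIDE keys per input segment, so a block can merge its interval entirely in shared memory. Below is a standalone sketch reproducing the mergePairs arithmetic with illustrative numbers (N and stride are hypothetical, not taken from the sample):

#include <cstdio>

// Recomputes mergePairs as the host wrapper above does; assumes the
// sample's SAMPLE_STRIDE = 128.
int main()
{
    const unsigned sampleStride = 128;
    unsigned N = 4096, stride = 1024;
    unsigned lastSegmentElements = N % (2 * stride); // 0 here: segments pair up evenly
    unsigned ceilSamples = N / sampleStride + ((N % sampleStride) ? 1 : 0); // getSampleCount(N)
    unsigned mergePairs = (lastSegmentElements > stride) ? ceilSamples : (N - lastSegmentElements) / sampleStride;
    printf("mergePairs = %u\n", mergePairs); // 32 blocks, each of SAMPLE_STRIDE threads
    return 0;
}
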
extern "C" void bitonicSortShared(uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint batchSize,
uint arrayLength,
uint sortDir);
extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB,
uint stride,
uint N,
uint sortDir);
static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
static const uint MAX_SAMPLE_COUNT = 32768;
extern "C" void initMergeSort(void)
{
checkCudaErrors(cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
}
extern "C" void closeMergeSort(void)
{
checkCudaErrors(cudaFree(d_RanksA));
checkCudaErrors(cudaFree(d_RanksB));
checkCudaErrors(cudaFree(d_LimitsB));
checkCudaErrors(cudaFree(d_LimitsA));
}
extern "C" void mergeSort(uint *d_DstKey,
uint *d_DstVal,
uint *d_BufKey,
uint *d_BufVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint N,
uint sortDir)
{
uint stageCount = 0;
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
;
uint *ikey, *ival, *okey, *oval;
if (stageCount & 1) {
ikey = d_BufKey;
ival = d_BufVal;
okey = d_DstKey;
oval = d_DstVal;
}
else {
ikey = d_DstKey;
ival = d_DstVal;
okey = d_BufKey;
oval = d_BufVal;
}
assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
assert(N % SHARED_SIZE_LIMIT == 0);
mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, SHARED_SIZE_LIMIT, sortDir);
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
uint lastSegmentElements = N % (2 * stride);
// Find sample ranks and prepare for limiters merge
generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir);
// Merge ranks and indices
mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);
// Merge elementary intervals
mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, stride, N, sortDir);
if (lastSegmentElements <= stride) {
// Last merge segment consists of a single array which just needs to be
// passed through
checkCudaErrors(cudaMemcpy(okey + (N - lastSegmentElements),
ikey + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint),
cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemcpy(oval + (N - lastSegmentElements),
ival + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint),
cudaMemcpyDeviceToDevice));
}
uint *t;
t = ikey;
ikey = okey;
okey = t;
t = ival;
ival = oval;
oval = t;
}
}
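
Taken together, initMergeSort(), mergeSort() and closeMergeSort() are the whole public surface of this file. For orientation, a minimal host-side driver might look like the sketch below; runSort() and its buffers are hypothetical and not part of the sample, and the preconditions mirror the asserts in mergeSort() (N a multiple of SHARED_SIZE_LIMIT, N <= SAMPLE_STRIDE * MAX_SAMPLE_COUNT):

#include <cuda_runtime.h>
#include <helper_cuda.h>
#include "mergeSort_common.h"

// Hypothetical driver: sorts N key/value pairs ascending and copies them back.
void runSort(uint *h_Key, uint *h_Val, uint N)
{
    uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
    checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
    checkCudaErrors(cudaMemcpy(d_SrcKey, h_Key, N * sizeof(uint), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_SrcVal, h_Val, N * sizeof(uint), cudaMemcpyHostToDevice));
    initMergeSort();
    mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, 1); // 1 = ascending
    checkCudaErrors(cudaDeviceSynchronize());
    closeMergeSort();
    checkCudaErrors(cudaMemcpy(h_Key, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_Val, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaFree(d_DstVal));
    checkCudaErrors(cudaFree(d_DstKey));
    checkCudaErrors(cudaFree(d_BufVal));
    checkCudaErrors(cudaFree(d_BufKey));
    checkCudaErrors(cudaFree(d_SrcVal));
    checkCudaErrors(cudaFree(d_SrcKey));
}
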


@@ -31,19 +31,17 @@
typedef unsigned int uint;
#define SHARED_SIZE_LIMIT 1024U
#define SAMPLE_STRIDE 128
////////////////////////////////////////////////////////////////////////////////
// Extensive sort validation routine
////////////////////////////////////////////////////////////////////////////////
extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
uint arrayLength, uint numValues,
uint sortDir);
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir);
extern "C" void fillValues(uint *val, uint N);
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey,
uint batchSize, uint arrayLength);
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength);
////////////////////////////////////////////////////////////////////////////////
// CUDA merge sort
@@ -52,13 +50,11 @@ extern "C" void closeMergeSort(void);
extern "C" void closeMergeSort(void);
extern "C" void mergeSort(uint *dstKey, uint *dstVal, uint *bufKey,
uint *bufVal, uint *srcKey, uint *srcVal, uint N,
uint sortDir);
extern "C" void
mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);
////////////////////////////////////////////////////////////////////////////////
// CPU "emulation"
////////////////////////////////////////////////////////////////////////////////
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
uint *bufVal, uint *srcKey, uint *srcVal, uint N,
uint sortDir);
extern "C" void
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);


@@ -29,329 +29,335 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "mergeSort_common.h"
////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static void checkOrder(uint *data, uint N, uint sortDir)
{
    if (N <= 1) {
        return;
    }
    for (uint i = 0; i < N - 1; i++)
        if ((sortDir && (data[i] > data[i + 1])) || (!sortDir && (data[i] < data[i + 1]))) {
            fprintf(stderr, "checkOrder() failed!!!\n");
            exit(EXIT_FAILURE);
        }
}
static uint umin(uint a, uint b) { return (a <= b) ? a : b; }
static uint getSampleCount(uint dividend)
{
    return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1) : (dividend / SAMPLE_STRIDE);
}
static uint nextPowerOfTwo(uint x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}
static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }
    uint pos = 0;
    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);
        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }
    return pos;
}
static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }
    uint pos = 0;
    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);
        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }
    return pos;
}
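
The two search variants differ only in the comparison (strict < versus <=), and that difference is what keeps the merge stable: for a run of equal keys, elements from segment A land before equal elements from segment B. A standalone illustration with made-up values, using a naive reference in place of the binary searches above:

#include <assert.h>
#include <stdio.h>

// Naive reference (ascending order, sortDir = 1): "inclusive" counts
// elements <= val, "exclusive" counts elements < val.
int main(void)
{
    unsigned data[] = {3, 5, 5, 8};
    unsigned val = 5, incl = 0, excl = 0;
    for (int i = 0; i < 4; i++) {
        incl += (data[i] <= val);
        excl += (data[i] < val);
    }
    printf("inclusive = %u, exclusive = %u\n", incl, excl); // 3 and 1
    assert(incl == 3 && excl == 1);
    return 0;
}
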
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: find sample ranks in each segment
////////////////////////////////////////////////////////////////////////////////
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
    for (uint pos = 0; pos < sampleCount; pos++) {
        const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
        const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
        const uint lenA = stride;
        const uint lenB = umin(stride, N - segmentBase - stride);
        const uint nA = stride / SAMPLE_STRIDE;
        const uint nB = getSampleCount(lenB);
        if (i < nA) {
            ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
            ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = binarySearchExclusive(
                srcKey[segmentBase + i * SAMPLE_STRIDE], srcKey + segmentBase + stride, lenB, sortDir);
        }
        if (i < nB) {
            ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
            ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = binarySearchInclusive(
                srcKey[segmentBase + stride + i * SAMPLE_STRIDE], srcKey + segmentBase, lenA, sortDir);
        }
    }
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: merge ranks and indices to derive elementary intervals
////////////////////////////////////////////////////////////////////////////////
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
    for (uint pos = 0; pos < sampleCount; pos++) {
        const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
        const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
        const uint lenA = stride;
        const uint lenB = umin(stride, N - segmentBase - stride);
        const uint nA = stride / SAMPLE_STRIDE;
        const uint nB = getSampleCount(lenB);
        if (i < nA) {
            uint dstPosA =
                binarySearchExclusive(
                    ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], ranks + (segmentBase + stride) / SAMPLE_STRIDE, nB, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
        }
        if (i < nB) {
            uint dstPosA =
                binarySearchInclusive(
                    ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
        }
    }
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals (each interval is <= SAMPLE_STRIDE)
////////////////////////////////////////////////////////////////////////////////
static void merge(uint *dstKey,
                  uint *dstVal,
                  uint *srcAKey,
                  uint *srcAVal,
                  uint *srcBKey,
                  uint *srcBVal,
                  uint lenA,
                  uint lenB,
                  uint sortDir)
{
    checkOrder(srcAKey, lenA, sortDir);
    checkOrder(srcBKey, lenB, sortDir);
    for (uint i = 0; i < lenA; i++) {
        uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i;
        assert(dstPos < lenA + lenB);
        dstKey[dstPos] = srcAKey[i];
        dstVal[dstPos] = srcAVal[i];
    }
    for (uint i = 0; i < lenB; i++) {
        uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i;
        assert(dstPos < lenA + lenB);
        dstKey[dstPos] = srcBKey[i];
        dstVal[dstPos] = srcBVal[i];
    }
}
static void mergeElementaryIntervals(uint *dstKey,
                                     uint *dstVal,
                                     uint *srcKey,
                                     uint *srcVal,
                                     uint *limitsA,
                                     uint *limitsB,
                                     uint stride,
                                     uint N,
                                     uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
    for (uint pos = 0; pos < mergePairs; pos++) {
        uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
        uint segmentBase = (pos - i) * SAMPLE_STRIDE;
        const uint lenA = stride;
        const uint lenB = umin(stride, N - segmentBase - stride);
        const uint nA = stride / SAMPLE_STRIDE;
        const uint nB = getSampleCount(lenB);
        const uint n = nA + nB;
        const uint startPosA = limitsA[pos];
        const uint endPosA = (i + 1 < n) ? limitsA[pos + 1] : lenA;
        const uint startPosB = limitsB[pos];
        const uint endPosB = (i + 1 < n) ? limitsB[pos + 1] : lenB;
        const uint startPosDst = startPosA + startPosB;
        assert(startPosA <= endPosA && endPosA <= lenA);
        assert(startPosB <= endPosB && endPosB <= lenB);
        assert((endPosA - startPosA) <= SAMPLE_STRIDE);
        assert((endPosB - startPosB) <= SAMPLE_STRIDE);
        merge(dstKey + segmentBase + startPosDst,
              dstVal + segmentBase + startPosDst,
              (srcKey + segmentBase + 0) + startPosA,
              (srcVal + segmentBase + 0) + startPosA,
              (srcKey + segmentBase + stride) + startPosB,
              (srcVal + segmentBase + stride) + startPosB,
              endPosA - startPosA,
              endPosB - startPosB,
              sortDir);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Naive bubble sort
////////////////////////////////////////////////////////////////////////////////
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir)
{
    if (N <= 1) {
        return;
    }
    for (uint bottom = 0; bottom < N - 1; bottom++) {
        uint savePos = bottom;
        uint saveKey = key[bottom];
        for (uint i = bottom + 1; i < N; i++)
            if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) {
                savePos = i;
                saveKey = key[i];
            }
        if (savePos != bottom) {
            uint t;
            t = key[savePos];
            key[savePos] = key[bottom];
            key[bottom] = t;
            t = val[savePos];
            val[savePos] = val[bottom];
            val[bottom] = t;
        }
    }
}
////////////////////////////////////////////////////////////////////////////////
// Interface function
////////////////////////////////////////////////////////////////////////////////
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
uint *bufVal, uint *srcKey, uint *srcVal, uint N,
uint sortDir) {
uint *ikey, *ival, *okey, *oval;
uint stageCount = 0;
extern "C" void
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir)
{
uint *ikey, *ival, *okey, *oval;
uint stageCount = 0;
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
;
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
;
    if (stageCount & 1) {
        ikey = bufKey;
        ival = bufVal;
        okey = dstKey;
        oval = dstVal;
    }
    else {
        ikey = dstKey;
        ival = dstVal;
        okey = bufKey;
        oval = bufVal;
    }
    printf("Bottom-level sort...\n");
    memcpy(ikey, srcKey, N * sizeof(uint));
    memcpy(ival, srcVal, N * sizeof(uint));
for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos), sortDir);
}
printf("Merge...\n");
uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
uint lastSegmentElements = N % (2 * stride);
// Find sample ranks and prepare for limiters merge
generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);
// Merge ranks and indices
mergeRanksAndIndices(limitsA, ranksA, stride, N);
mergeRanksAndIndices(limitsB, ranksB, stride, N);
// Merge elementary intervals
mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, N, sortDir);
if (lastSegmentElements <= stride) {
// Last merge segment consists of a single array which just needs to be
// passed through
memcpy(
okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
memcpy(
oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
}
uint *t;
t = ikey;
ikey = okey;
okey = t;
t = ival;
ival = oval;
oval = t;
}
free(limitsB);
free(limitsA);
free(ranksB);
free(ranksA);
}
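
The stageCount parity logic at the top of mergeSortHost() (and of the GPU mergeSort() earlier) decides where the bottom-level sort must write so that, after the per-stage buffer swaps, the final stage lands in dstKey/dstVal. A small standalone check of that arithmetic (the N values are arbitrary):

#include <stdio.h>

#define SHARED_SIZE_LIMIT 1024U

// Counts merge stages the way the loop above does and reports which buffer
// the bottom-level sort must start in for results to end up in dst.
int main(void)
{
    for (unsigned N = SHARED_SIZE_LIMIT; N <= 16 * SHARED_SIZE_LIMIT; N <<= 1) {
        unsigned stageCount = 0;
        for (unsigned stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
            ;
        printf("N = %6u: %u stages, start in %s\n", N, stageCount, (stageCount & 1) ? "buf" : "dst");
    }
    return 0;
}
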


@@ -29,104 +29,100 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "mergeSort_common.h"
////////////////////////////////////////////////////////////////////////////////
// Validate sorted keys array (check for integrity and proper order)
////////////////////////////////////////////////////////////////////////////////
extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
uint arrayLength, uint numValues,
uint sortDir) {
uint *srcHist;
uint *resHist;
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir)
{
uint *srcHist;
uint *resHist;
    if (arrayLength < 2) {
        printf("validateSortedKeys(): arrays too short, exiting...\n");
        return 1;
    }
    printf("...inspecting keys array: ");
    srcHist = (uint *)malloc(numValues * sizeof(uint));
    resHist = (uint *)malloc(numValues * sizeof(uint));
    int flag = 1;
    for (uint j = 0; j < batchSize; j++, srcKey += arrayLength, resKey += arrayLength) {
        // Build histograms for keys arrays
        memset(srcHist, 0, numValues * sizeof(uint));
        memset(resHist, 0, numValues * sizeof(uint));
        for (uint i = 0; i < arrayLength; i++) {
            if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
                srcHist[srcKey[i]]++;
                resHist[resKey[i]]++;
            }
            else {
                fprintf(stderr, "***Set %u source/result key arrays are not limited properly***\n", j);
                flag = 0;
                goto brk;
            }
        }
        // Compare the histograms
        for (uint i = 0; i < numValues; i++)
            if (srcHist[i] != resHist[i]) {
                fprintf(stderr, "***Set %u source/result keys histograms do not match***\n", j);
                flag = 0;
                goto brk;
            }
        // Finally check the ordering
        for (uint i = 0; i < arrayLength - 1; i++)
            if ((sortDir && (resKey[i] > resKey[i + 1])) || (!sortDir && (resKey[i] < resKey[i + 1]))) {
                fprintf(stderr, "***Set %u result key array is not ordered properly***\n", j);
                flag = 0;
                goto brk;
            }
    }
brk:
    free(resHist);
    free(srcHist);
    if (flag)
        printf("OK\n");
    return flag;
}
////////////////////////////////////////////////////////////////////////////////
// Value validation / stability check routines
////////////////////////////////////////////////////////////////////////////////
extern "C" void fillValues(uint *val, uint N) {
for (uint i = 0; i < N; i++) val[i] = i;
extern "C" void fillValues(uint *val, uint N)
{
for (uint i = 0; i < N; i++)
val[i] = i;
}
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey,
uint batchSize, uint arrayLength) {
int correctFlag = 1, stableFlag = 1;
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength)
{
int correctFlag = 1, stableFlag = 1;
printf("...inspecting keys and values array: ");
printf("...inspecting keys and values array: ");
    for (uint i = 0; i < batchSize; i++, resKey += arrayLength, resVal += arrayLength) {
        for (uint j = 0; j < arrayLength; j++) {
            if (resKey[j] != srcKey[resVal[j]])
                correctFlag = 0;
            if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && (resVal[j] > resVal[j + 1]))
                stableFlag = 0;
        }
    }
    printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
    printf(stableFlag ? "...stability property: stable!\n" : "...stability property: NOT stable\n");
    return correctFlag;
}


@@ -11,8 +11,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries


@@ -30,7 +30,7 @@ cudaStreamCreateWithFlags, cudaFree, cudaDeviceGetAttribute, cudaMallocHost, cud
## Prerequisites
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.
## References (for more details)


@@ -29,106 +29,105 @@
#include <stdio.h>
// Includes CUDA
#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cuda_runtime.h>
// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check
namespace cg = cooperative_groups;
#if __CUDA_ARCH__ >= 700
template <bool writeSquareRoot>
__device__ void reduceBlockData(cuda::barrier<cuda::thread_scope_block> &barrier,
                                cg::thread_block_tile<32> &tile32,
                                double &threadSum,
                                double *result)
{
    extern __shared__ double tmp[];
#pragma unroll
    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
        threadSum += tile32.shfl_down(threadSum, offset);
    }
    if (tile32.thread_rank() == 0) {
        tmp[tile32.meta_group_rank()] = threadSum;
    }
    auto token = barrier.arrive();
    barrier.wait(std::move(token));
    // The warp 0 will perform last round of reduction
    if (tile32.meta_group_rank() == 0) {
        double beta = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0;
#pragma unroll
        for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
            beta += tile32.shfl_down(beta, offset);
        }
        if (tile32.thread_rank() == 0) {
            if (writeSquareRoot)
                *result = sqrt(beta);
            else
                *result = beta;
        }
    }
}
#endif
__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size)
{
#if __CUDA_ARCH__ >= 700
#pragma diag_suppress static_var_with_dynamic_init
    cg::thread_block cta = cg::this_thread_block();
    cg::grid_group grid = cg::this_grid();
    cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
    __shared__ cuda::barrier<cuda::thread_scope_block> barrier;
    if (threadIdx.x == 0) {
        init(&barrier, blockDim.x);
    }
    cg::sync(cta);
    double threadSum = 0.0;
    for (int i = grid.thread_rank(); i < size; i += grid.size()) {
        threadSum += (double)(vecA[i] * vecB[i]);
    }
    // Each thread block performs reduction of partial dotProducts and writes to
    // global mem.
    reduceBlockData<false>(barrier, tile32, threadSum, &partialResults[blockIdx.x]);
    cg::sync(grid);
    // One block performs the final summation of partial dot products
    // of all the thread blocks and writes the sqrt of final dot product.
    if (blockIdx.x == 0) {
        threadSum = 0.0;
        for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) {
            threadSum += partialResults[i];
        }
        reduceBlockData<true>(barrier, tile32, threadSum, &partialResults[0]);
    }
    cg::sync(grid);
    const double finalValue = partialResults[0];
    // Perform normalization of vecA & vecB.
    for (int i = grid.thread_rank(); i < size; i += grid.size()) {
        vecA[i] = (float)vecA[i] / finalValue;
        vecB[i] = (float)vecB[i] / finalValue;
    }
#endif
}
@@ -137,119 +136,113 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", argv[0]);
    // This will pick the best possible CUDA capable device
    int dev = findCudaDevice(argc, (const char **)argv);
    int major = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
    // Arrive-Wait Barrier requires a GPU of Volta (SM7X) architecture or higher.
    if (major < 7) {
        printf("simpleAWBarrier requires SM 7.0 or higher. Exiting...\n");
        exit(EXIT_WAIVED);
    }
    int supportsCooperativeLaunch = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev));
    if (!supportsCooperativeLaunch) {
        printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
               "Waiving the run\n",
               dev);
        exit(EXIT_WAIVED);
    }
    int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev);
    printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
{
    float  *vecA, *d_vecA;
    float  *vecB, *d_vecB;
    double *d_partialResults;
    int     size = 10000000;

    checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size));
    checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size));
    checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size));
    checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size));

    float baseVal = 2.0;
    for (int i = 0; i < size; i++) {
        vecA[i] = vecB[i] = baseVal;
    }

    cudaStream_t stream;
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, cudaMemcpyHostToDevice, stream));

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.
    int minGridSize = 0, blockSize = 0;
    checkCudaErrors(
        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));

    int smemSize = ((blockSize / 32) + 1) * sizeof(double);

    int numBlocksPerSm = 0;
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));

    int multiProcessorCount = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));

    minGridSize = multiProcessorCount * numBlocksPerSm;
    checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));

    printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
           "blockSize = %d\n",
           minGridSize,
           blockSize);

    dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);

    void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, (void *)&d_partialResults, (void *)&size};

    checkCudaErrors(cudaLaunchCooperativeKernel(
        (void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream));

    checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    float        expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
    unsigned int matches        = 0;

    for (int i = 0; i < size; i++) {
        if ((vecA[i] - expectedResult) > 0.00001) {
            printf("mismatch at i = %d\n", i);
            break;
        }
        else {
            matches++;
        }
    }

    printf("Result = %s\n", matches == size ? "PASSED" : "FAILED");
    checkCudaErrors(cudaFree(d_vecA));
    checkCudaErrors(cudaFree(d_vecB));
    checkCudaErrors(cudaFree(d_partialResults));
    checkCudaErrors(cudaFreeHost(vecA));
    checkCudaErrors(cudaFreeHost(vecB));
    return matches == size;
}
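
The device kernel itself is not shown in this diff, so here is a minimal sketch of the grid-wide pattern the host code above launches cooperatively. This is an illustration only: the kernel name, shared-memory layout, and reduction strategy are assumptions, and it uses plain block/grid synchronization where the real sample uses cuda::barrier arrive/wait calls; it also assumes blockDim.x is a multiple of 32.

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Hypothetical sketch (NOT the sample's kernel): grid-wide dot product of
// vecA and vecB, then normalization of vecA by the resulting length.
__global__ void normVecByDotProductSketch(float *vecA, float *vecB, double *partialResults, int size)
{
    cg::thread_block block = cg::this_thread_block();
    cg::grid_group   grid  = cg::this_grid();
    extern __shared__ double smem[]; // (blockDim.x / 32) + 1 doubles, matching smemSize on the host

    // Per-thread partial sum over a grid-stride range.
    double sum = 0.0;
    for (int i = grid.thread_rank(); i < size; i += grid.size())
        sum += (double)vecA[i] * (double)vecB[i];

    // Warp-level reduction; lane 0 of each warp parks its partial in shared memory.
    for (int offset = 16; offset > 0; offset >>= 1)
        sum += __shfl_down_sync(0xffffffff, sum, offset);
    if ((threadIdx.x & 31) == 0)
        smem[threadIdx.x >> 5] = sum;
    block.sync();

    // Thread 0 folds the per-warp partials into this block's slot.
    if (threadIdx.x == 0) {
        double blockSum = 0.0;
        for (int w = 0; w < blockDim.x / 32; w++)
            blockSum += smem[w];
        partialResults[blockIdx.x] = blockSum;
    }
    grid.sync(); // grid-wide barrier: every block's sum is now visible

    // Every thread folds the block sums, then normalizes vecA in place.
    double dot = 0.0;
    for (int b = 0; b < gridDim.x; b++)
        dot += partialResults[b];
    float invLen = rsqrtf((float)dot);
    for (int i = grid.thread_rank(); i < size; i += grid.size())
        vecA[i] *= invLen;
}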

View File

@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Remove -DNDEBUG so this sample can print its debug-specific logs.

View File

@ -27,6 +27,6 @@ cudaDeviceSynchronize, cudaGetErrorString
## Prerequisites
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details)

View File

@ -34,17 +34,17 @@
#endif
// Includes, system
#include <cassert>
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check
const char *sampleName = "simpleAssert";
@ -58,9 +58,10 @@ bool testResult = true;
//! Tests assert function.
//! Thread whose id > N will print assertion failed error message.
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int N)
{
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    assert(gtid < N);
}
////////////////////////////////////////////////////////////////////////////////
@ -70,59 +71,60 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
void runTest(int argc, char **argv)
{
    int         Nblocks  = 2;
    int         Nthreads = 32;
    cudaError_t error;

#ifndef _WIN32
    utsname OS_System_Type;
    uname(&OS_System_Type);

    printf("OS_System_Type.release = %s\n", OS_System_Type.release);

    if (!strcasecmp(OS_System_Type.sysname, "Darwin")) {
        printf("simpleAssert is not currently supported on Mac OSX\n\n");
        exit(EXIT_SUCCESS);
    }
    else {
        printf("OS Info: <%s>\n\n", OS_System_Type.version);
    }
#endif

    // This will pick the best possible CUDA capable device
    findCudaDevice(argc, (const char **)argv);

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.
    dim3 dimGrid(Nblocks);
    dim3 dimBlock(Nthreads);

    printf("Launch kernel to generate assertion failures\n");
    testKernel<<<dimGrid, dimBlock>>>(60);

    // Synchronize (flushes assert output).
    printf("\n-- Begin assert output\n\n");
    error = cudaDeviceSynchronize();
    printf("\n-- End assert output\n\n");

    // Check for errors and failed asserts in asynchronous kernel launch.
    if (error == cudaErrorAssert) {
        printf("Device assert failed as expected, "
               "CUDA error message is: %s\n\n",
               cudaGetErrorString(error));
    }

    testResult = error == cudaErrorAssert;
}
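
One caveat worth noting alongside this test (editorial note, not part of the sample): once a device-side assert has fired, the CUDA context is left in an unusable state, so any follow-on GPU work in the same process needs a device reset first. A minimal sketch:

// Illustrative only: after cudaErrorAssert, subsequent CUDA calls on this
// context keep failing, so reset the device before doing further GPU work.
if (error == cudaErrorAssert) {
    cudaDeviceReset(); // hypothetical cleanup step, not present in the sample
}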

View File

@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@ -30,7 +30,7 @@ cuModuleGetFunction, cuLaunchKernel, cuCtxSynchronize
## Prerequisites
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.
## References (for more details)

View File

@ -34,15 +34,16 @@
#endif
// Includes, system
#include <cassert>
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>
#include "nvrtc_helper.h"

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
const char *sampleName = "simpleAssert_nvrtc";
@ -58,56 +59,63 @@ void runTest(int argc, char **argv);
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
void runTest(int argc, char **argv)
{
    int Nblocks  = 2;
    int Nthreads = 32;

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.
    dim3 dimGrid(Nblocks);
    dim3 dimBlock(Nthreads);

    printf("Launch kernel to generate assertion failures\n");
    char  *cubin, *kernel_file;
    size_t cubinSize;

    kernel_file = sdkFindFilePath("simpleAssert_kernel.cu", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

    CUmodule   module = loadCUBIN(cubin, argc, argv);
    CUfunction kernel_addr;

    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel"));

    int   count  = 60;
    void *args[] = {(void *)&count};

    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   dimGrid.x,
                                   dimGrid.y,
                                   dimGrid.z, /* grid dim */
                                   dimBlock.x,
                                   dimBlock.y,
                                   dimBlock.z, /* block dim */
                                   0,
                                   0,         /* shared mem, stream */
                                   &args[0],  /* arguments */
                                   0));

    // Synchronize (flushes assert output).
    printf("\n-- Begin assert output\n\n");
    CUresult res = cuCtxSynchronize();

    printf("\n-- End assert output\n\n");

    // Check for errors and failed asserts in asynchronous kernel launch.
    if (res == CUDA_ERROR_ASSERT) {
        printf("Device assert failed as expected\n");
    }

    testResult = res == CUDA_ERROR_ASSERT;
}

View File

@ -32,7 +32,8 @@
//! Thread whose id > N will print assertion failed error message.
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void testKernel(int N)
{
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    assert(gtid < N);
}

View File

@ -11,8 +11,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@ -27,6 +27,6 @@ cudaStreamCreateWithFlags, cudaFree, cudaMallocHost, cudaFreeHost, cudaStreamSyn
## Prerequisites
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details)

View File

@ -30,10 +30,10 @@
*/
// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@ -45,10 +45,10 @@
#include <cuda_runtime.h>
// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check
// Includes, kernels
#include "simpleAtomicIntrinsics_kernel.cuh"
@ -68,67 +68,67 @@ extern "C" bool computeGold(int *gpuData, const int len);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    cudaStream_t stream;
    // This will pick the best possible CUDA capable device
    findCudaDevice(argc, (const char **)argv);

    StopWatchInterface *timer;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    unsigned int numThreads = 256;
    unsigned int numBlocks  = 64;
    unsigned int numData    = 11;
    unsigned int memSize    = sizeof(int) * numData;

    // allocate mem for the result on host side
    int *hOData;
    checkCudaErrors(cudaMallocHost(&hOData, memSize));

    // initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    // allocate device memory for result
    int *dOData;
    checkCudaErrors(cudaMalloc((void **)&dOData, memSize));
    // copy host memory to device to initialize to zero
    checkCudaErrors(cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream));

    // execute the kernel
    testKernel<<<numBlocks, numThreads, 0, stream>>>(dOData);

    // Copy result from device to host
    checkCudaErrors(cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    checkCudaErrors(cudaFreeHost(hOData));
    checkCudaErrors(cudaFree(dOData));
}

View File

@ -42,141 +42,142 @@ extern "C" int computeGold(int *gpuData, const int len);
//! @param idata input data as provided to device
//! @param len number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
int computeGold(int *gpuData, const int len)
{
    int val = 0;

    for (int i = 0; i < len; ++i) {
        val += 10;
    }

    if (val != gpuData[0]) {
        printf("atomicAdd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        val -= 10;
    }

    if (val != gpuData[1]) {
        printf("atomicSub failed\n");
        return false;
    }

    bool found = false;

    for (int i = 0; i < len; ++i) {
        // third element should be a member of [0, len)
        if (i == gpuData[2]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for (int i = 0; i < len; ++i) {
        // fourth element should be len-1
        val = max(val, i);
    }

    if (val != gpuData[3]) {
        printf("atomicMax failed\n");
        return false;
    }

    val = 1 << 8;

    for (int i = 0; i < len; ++i) {
        val = min(val, i);
    }

    if (val != gpuData[4]) {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val       = 0;

    for (int i = 0; i < len; ++i) {
        val = (val >= limit) ? 0 : val + 1;
    }

    if (val != gpuData[5]) {
        printf("atomicInc failed\n");
        return false;
    }

    limit = 137;
    val   = 0;

    for (int i = 0; i < len; ++i) {
        val = ((val == 0) || (val > limit)) ? limit : val - 1;
    }

    if (val != gpuData[6]) {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;

    for (int i = 0; i < len; ++i) {
        // eighth element should be a member of [0, len)
        if (i == gpuData[7]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 9th element should be 1
        val &= (2 * i + 7);
    }

    if (val != gpuData[8]) {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        // 10th element should be 0xff
        val |= (1 << i);
    }

    if (val != gpuData[9]) {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 11th element should be 0xff
        val ^= i;
    }

    if (val != gpuData[10]) {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}
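
A quick sanity check of the atomicInc branch above (illustration only, not part of the sample): atomicInc(&x, 17) cycles x through 0..17, a period of 18, so the expected value after all increments is len mod 18.

// Illustrative only: expected atomicInc result for the sample's default launch.
int len         = 256 * 64;  // numThreads * numBlocks = 16384
int expectedInc = len % 18;  // atomicInc(&x, 17) wraps at 17, so 16384 % 18 == 4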

View File

@ -35,48 +35,49 @@
//! @param g_idata input data in global memory
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int *g_odata)
{
    // access thread id
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Test various atomic instructions

    // Arithmetic atomic instructions

    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    // Atomic subtraction (final should be 0)
    atomicSub(&g_odata[1], 10);

    // Atomic exchange
    atomicExch(&g_odata[2], tid);

    // Atomic maximum
    atomicMax(&g_odata[3], tid);

    // Atomic minimum
    atomicMin(&g_odata[4], tid);

    // Atomic increment (modulo 17+1)
    atomicInc((unsigned int *)&g_odata[5], 17);

    // Atomic decrement
    atomicDec((unsigned int *)&g_odata[6], 137);

    // Atomic compare-and-swap
    atomicCAS(&g_odata[7], tid - 1, tid);

    // Bitwise atomic instructions

    // Atomic AND
    atomicAnd(&g_odata[8], 2 * tid + 7);

    // Atomic OR
    atomicOr(&g_odata[9], 1 << tid);

    // Atomic XOR
    atomicXor(&g_odata[10], tid);
}

#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_

View File

@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@ -33,7 +33,7 @@ cudaBlockSize, cudaGridSize
## Prerequisites
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.
## References (for more details)

View File

@ -30,10 +30,10 @@
*/
// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@ -46,7 +46,7 @@
#include <nvrtc_helper.h>
// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
const char *sampleName = "simpleAtomicIntrinsics_nvrtc";
@ -64,84 +64,90 @@ extern "C" bool computeGold(int *gpuData, const int len);
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    int dev = 0;

    char  *cubin, *kernel_file;
    size_t cubinSize;

    kernel_file = sdkFindFilePath("simpleAtomicIntrinsics_kernel.cuh", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

    CUmodule   module = loadCUBIN(cubin, argc, argv);
    CUfunction kernel_addr;

    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel"));

    StopWatchInterface *timer;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    unsigned int numThreads = 256;
    unsigned int numBlocks  = 64;
    unsigned int numData    = 11;
    unsigned int memSize    = sizeof(int) * numData;

    // allocate mem for the result on host side
    int *hOData = (int *)malloc(memSize);

    // initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    // allocate device memory for result
    CUdeviceptr dOData;
    checkCudaErrors(cuMemAlloc(&dOData, memSize));
    checkCudaErrors(cuMemcpyHtoD(dOData, hOData, memSize));

    // execute the kernel
    dim3 cudaBlockSize(numThreads, 1, 1);
    dim3 cudaGridSize(numBlocks, 1, 1);

    void *arr[] = {(void *)&dOData};
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   cudaGridSize.x,
                                   cudaGridSize.y,
                                   cudaGridSize.z, /* grid dim */
                                   cudaBlockSize.x,
                                   cudaBlockSize.y,
                                   cudaBlockSize.z, /* block dim */
                                   0,
                                   0,              /* shared mem, stream */
                                   &arr[0],        /* arguments */
                                   0));

    checkCudaErrors(cuCtxSynchronize());

    checkCudaErrors(cuMemcpyDtoH(hOData, dOData, memSize));

    // Copy result from device to host
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    free(hOData);
    checkCudaErrors(cuMemFree(dOData));
}

View File

@ -43,139 +43,140 @@ extern "C" int computeGold(int *gpuData, const int len);
//! @param len number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
int computeGold(int *gpuData, const int len)
{
    int val = 0;

    for (int i = 0; i < len; ++i) {
        val += 10;
    }

    if (val != gpuData[0]) {
        printf("atomicAdd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        val -= 10;
    }

    if (val != gpuData[1]) {
        printf("atomicSub failed\n");
        return false;
    }

    bool found = false;

    for (int i = 0; i < len; ++i) {
        // third element should be a member of [0, len)
        if (i == gpuData[2]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for (int i = 0; i < len; ++i) {
        // fourth element should be len-1
        val = max(val, i);
    }

    if (val != gpuData[3]) {
        printf("atomicMax failed\n");
        return false;
    }

    val = 1 << 8;

    for (int i = 0; i < len; ++i) {
        val = min(val, i);
    }

    if (val != gpuData[4]) {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val       = 0;

    for (int i = 0; i < len; ++i) {
        val = (val >= limit) ? 0 : val + 1;
    }

    if (val != gpuData[5]) {
        printf("atomicInc failed\n");
        return false;
    }

    limit = 137;
    val   = 0;

    for (int i = 0; i < len; ++i) {
        val = ((val == 0) || (val > limit)) ? limit : val - 1;
    }

    if (val != gpuData[6]) {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;

    for (int i = 0; i < len; ++i) {
        // eighth element should be a member of [0, len)
        if (i == gpuData[7]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 9th element should be 1
        val &= (2 * i + 7);
    }

    if (val != gpuData[8]) {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        // 10th element should be 0xff
        val |= (1 << i);
    }

    if (val != gpuData[9]) {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 11th element should be 0xff
        val ^= i;
    }

    if (val != gpuData[10]) {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}

View File

@ -36,45 +36,46 @@
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void testKernel(int *g_odata)
{
    // access thread id
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Test various atomic instructions

    // Arithmetic atomic instructions

    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    // Atomic subtraction (final should be 0)
    atomicSub(&g_odata[1], 10);

    // Atomic exchange
    atomicExch(&g_odata[2], tid);

    // Atomic maximum
    atomicMax(&g_odata[3], tid);

    // Atomic minimum
    atomicMin(&g_odata[4], tid);

    // Atomic increment (modulo 17+1)
    atomicInc((unsigned int *)&g_odata[5], 17);

    // Atomic decrement
    atomicDec((unsigned int *)&g_odata[6], 137);

    // Atomic compare-and-swap
    atomicCAS(&g_odata[7], tid - 1, tid);

    // Bitwise atomic instructions

    // Atomic AND
    atomicAnd(&g_odata[8], 2 * tid + 7);

    // Atomic OR
    atomicOr(&g_odata[9], 1 << tid);

    // Atomic XOR
    atomicXor(&g_odata[10], tid);
}

#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_

View File

@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@ -27,6 +27,6 @@ cudaFree, cudaMallocHost, cudaFreeHost, cudaStreamSynchronize, cudaStreamSetAttr
## Prerequisites
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details)

View File

@ -26,30 +26,31 @@
*/
// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes CUDA
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper functions for SDK examples
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);
cudaAccessPolicyWindow initAccessPolicyWindow(void)
{
    cudaAccessPolicyWindow accessPolicyWindow = {0};
    accessPolicyWindow.base_ptr               = (void *)0;
    accessPolicyWindow.num_bytes              = 0;
    accessPolicyWindow.hitRatio               = 0.f;
    accessPolicyWindow.hitProp                = cudaAccessPropertyNormal;
    accessPolicyWindow.missProp               = cudaAccessPropertyStreaming;
    return accessPolicyWindow;
}
////////////////////////////////////////////////////////////////////////////////
@ -60,35 +61,35 @@ cudaAccessPolicyWindow initAccessPolicyWindow(void) {
//! @param bigDataSize input bigData size
//! @param hitcount how many data access are done within block
////////////////////////////////////////////////////////////////////////////////
static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, int bigDataSize, int hitCount)
{
    __shared__ unsigned int hit;
    int                     row    = blockIdx.y * blockDim.y + threadIdx.y;
    int                     col    = blockIdx.x * blockDim.x + threadIdx.x;
    int                     tID    = row * blockDim.y + col;
    uint32_t                psRand = tID;

    atomicExch(&hit, 0);
    __syncthreads();
    while (hit < hitCount) {
        psRand ^= psRand << 13;
        psRand ^= psRand >> 17;
        psRand ^= psRand << 5;

        int idx = tID - psRand;
        if (idx < 0) {
            idx = -idx;
        }

        if ((tID % 2) == 0) {
            data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize];
        }
        else {
            trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize];
        }

        atomicAdd(&hit, 1);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
@ -98,117 +99,110 @@ int main(int argc, char **argv) { runTest(argc, argv); }
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    bool                   bTestResult = true;
    cudaAccessPolicyWindow accessPolicyWindow;
    cudaDeviceProp         deviceProp;
    cudaStreamAttrValue    streamAttrValue;
    cudaStream_t           stream;
    cudaStreamAttrID       streamAttrID;
    dim3                   threads(32, 32);
    int                   *dataDevicePointer;
    int                   *dataHostPointer;
    int                    dataSize;
    int                   *bigDataDevicePointer;
    int                   *bigDataHostPointer;
    int                    bigDataSize;
    StopWatchInterface    *timer = 0;

    printf("%s Starting...\n\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // Get device properties
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
    dim3 blocks(deviceProp.maxGridSize[1], 1);

    // Make sure the device supports the persisting L2 cache optimization
    if (deviceProp.persistingL2CacheMaxSize == 0) {
        printf("Waiving execution as device %d does not support persisting L2 "
               "Caching\n",
               devID);
        exit(EXIT_WAIVED);
    }

    // Create stream to associate with window
    checkCudaErrors(cudaStreamCreate(&stream));

    // Set the amount of L2 cache that will be persisting to the maximum the
    // device can support
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize));

    // Stream attribute to set
    streamAttrID = cudaStreamAttributeAccessPolicyWindow;

    // Default window
    streamAttrValue.accessPolicyWindow = initAccessPolicyWindow();
    accessPolicyWindow                 = initAccessPolicyWindow();

    // Allocate size of both buffers
    bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int);
    dataSize    = (deviceProp.l2CacheSize / 4) / sizeof(int);

    // Allocate data
    checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int)));
    checkCudaErrors(cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));

    for (int i = 0; i < bigDataSize; ++i) {
        if (i < dataSize) {
            dataHostPointer[i] = i;
        }

        bigDataHostPointer[bigDataSize - i - 1] = i;
    }

    checkCudaErrors(cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
    checkCudaErrors(
        cudaMemcpyAsync(dataDevicePointer, dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(
        bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream));

    // Make a window for the buffer of interest
    accessPolicyWindow.base_ptr        = (void *)dataDevicePointer;
    accessPolicyWindow.num_bytes       = dataSize * sizeof(int);
    accessPolicyWindow.hitRatio        = 1.f;
    accessPolicyWindow.hitProp         = cudaAccessPropertyPersisting;
    accessPolicyWindow.missProp        = cudaAccessPropertyNormal;
    streamAttrValue.accessPolicyWindow = accessPolicyWindow;

    // Assign window to stream
    checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));

    // Demote any previous persisting lines
    checkCudaErrors(cudaCtxResetPersistingL2Cache());

    checkCudaErrors(cudaStreamSynchronize(stream));
    kernCacheSegmentTest<<<blocks, threads, 0, stream>>>(
        dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF);

    checkCudaErrors(cudaStreamSynchronize(stream));
    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    // Free memory
    checkCudaErrors(cudaFreeHost(dataHostPointer));
    checkCudaErrors(cudaFreeHost(bigDataHostPointer));
    checkCudaErrors(cudaFree(dataDevicePointer));
    checkCudaErrors(cudaFree(bigDataDevicePointer));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
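
A tuning note on the window configured above (editorial, not from the sample): hitRatio = 1.f asks for the entire buffer to persist, which works here because dataSize is a quarter of the L2 cache. When num_bytes exceeds the persisting carve-out, the usual mitigation from the CUDA access-policy guidance is a fractional hitRatio, e.g.:

// Illustrative only: scale hitRatio so the persisting subset fits the carve-out.
float ratio = (float)deviceProp.persistingL2CacheMaxSize / (float)(dataSize * sizeof(int));
accessPolicyWindow.hitRatio = ratio < 1.f ? ratio : 1.f;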

View File

@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries
@ -59,12 +61,16 @@ if(${OpenGL_FOUND})
add_custom_command(TARGET simpleCUDA2GL
    POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E
        copy ${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/freeglut.dll
        ${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
)
add_custom_command(TARGET simpleCUDA2GL
    POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E
        copy ${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/glew64.dll
        ${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
)
endif()
endif()

View File

@ -30,8 +30,7 @@ cudaHostAlloc, cudaGraphicsUnmapResources, cudaMalloc, cudaFree, cudaGraphicsRes
## Prerequisites
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.
## References (for more details)

File diff suppressed because it is too large

View File

@ -35,28 +35,30 @@ __device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); }
__device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); }
// convert floating point rgb color to 8-bit integer
__device__ int rgbToInt(float r, float g, float b)
{
    r = clamp(r, 0.0f, 255.0f);
    g = clamp(g, 0.0f, 255.0f);
    b = clamp(b, 0.0f, 255.0f);
    return (int(b) << 16) | (int(g) << 8) | int(r);
}

__global__ void cudaProcess(unsigned int *g_odata, int imgw)
{
    extern __shared__ uchar4 sdata[];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bw = blockDim.x;
    int bh = blockDim.y;
    int x  = blockIdx.x * bw + tx;
    int y  = blockIdx.y * bh + ty;

    uchar4 c4 = make_uchar4((x & 0x20) ? 100 : 0, 0, (y & 0x20) ? 100 : 0, 0);
    g_odata[y * imgw + x] = rgbToInt(c4.z, c4.y, c4.x);
}

extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw)
{
    cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw);
}
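
A hypothetical host-side call showing how this wrapper is typically driven (the buffer name and image size are illustrative, not from the sample):

// Illustrative only: process a 512x512 image with 16x16 thread blocks.
unsigned int *d_out = nullptr;
cudaMalloc(&d_out, 512 * 512 * sizeof(unsigned int));
dim3 block(16, 16);
dim3 grid(512 / block.x, 512 / block.y); // assumes the image divides evenly
launch_cudaProcess(grid, block, block.x * block.y * sizeof(uchar4), d_out, 512);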

View File

@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@ -27,6 +27,6 @@ cudaHostAlloc, cudaStreamDestroy, cudaFree, cudaSetDevice, cudaGetDeviceCount, c
## Prerequisites
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details)

View File

@ -29,115 +29,124 @@
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// Create thread
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
{
return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
}
// Wait for thread to finish
void cutEndThread(CUTThread thread) {
WaitForSingleObject(thread, INFINITE);
CloseHandle(thread);
void cutEndThread(CUTThread thread)
{
WaitForSingleObject(thread, INFINITE);
CloseHandle(thread);
}
// Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num) {
WaitForMultipleObjects(num, threads, true, INFINITE);
void cutWaitForThreads(const CUTThread *threads, int num)
{
WaitForMultipleObjects(num, threads, true, INFINITE);
for (int i = 0; i < num; i++) {
CloseHandle(threads[i]);
}
for (int i = 0; i < num; i++) {
CloseHandle(threads[i]);
}
}
// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount) {
CUTBarrier barrier;
CUTBarrier cutCreateBarrier(int releaseCount)
{
CUTBarrier barrier;
InitializeCriticalSection(&barrier.criticalSection);
barrier.barrierEvent = CreateEvent(NULL, TRUE, FALSE, TEXT("BarrierEvent"));
barrier.count = 0;
barrier.releaseCount = releaseCount;
InitializeCriticalSection(&barrier.criticalSection);
barrier.barrierEvent = CreateEvent(NULL, TRUE, FALSE, TEXT("BarrierEvent"));
barrier.count = 0;
barrier.releaseCount = releaseCount;
return barrier;
return barrier;
}
// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier) {
int myBarrierCount;
EnterCriticalSection(&barrier->criticalSection);
myBarrierCount = ++barrier->count;
LeaveCriticalSection(&barrier->criticalSection);
void cutIncrementBarrier(CUTBarrier *barrier)
{
int myBarrierCount;
EnterCriticalSection(&barrier->criticalSection);
myBarrierCount = ++barrier->count;
LeaveCriticalSection(&barrier->criticalSection);
if (myBarrierCount >= barrier->releaseCount) {
SetEvent(barrier->barrierEvent);
}
if (myBarrierCount >= barrier->releaseCount) {
SetEvent(barrier->barrierEvent);
}
}
// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier) {
WaitForSingleObject(barrier->barrierEvent, INFINITE);
}
void cutWaitForBarrier(CUTBarrier *barrier) { WaitForSingleObject(barrier->barrierEvent, INFINITE); }
// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier) {}
#else
// Create thread
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
pthread_t thread;
pthread_create(&thread, NULL, func, data);
return thread;
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
{
pthread_t thread;
pthread_create(&thread, NULL, func, data);
return thread;
}
// Wait for thread to finish
void cutEndThread(CUTThread thread) { pthread_join(thread, NULL); }
// Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num) {
for (int i = 0; i < num; i++) {
cutEndThread(threads[i]);
}
void cutWaitForThreads(const CUTThread *threads, int num)
{
for (int i = 0; i < num; i++) {
cutEndThread(threads[i]);
}
}
// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount) {
CUTBarrier barrier;
CUTBarrier cutCreateBarrier(int releaseCount)
{
CUTBarrier barrier;
barrier.count = 0;
barrier.releaseCount = releaseCount;
barrier.count = 0;
barrier.releaseCount = releaseCount;
pthread_mutex_init(&barrier.mutex, 0);
pthread_cond_init(&barrier.conditionVariable, 0);
pthread_mutex_init(&barrier.mutex, 0);
pthread_cond_init(&barrier.conditionVariable, 0);
return barrier;
return barrier;
}
// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier) {
int myBarrierCount;
pthread_mutex_lock(&barrier->mutex);
myBarrierCount = ++barrier->count;
pthread_mutex_unlock(&barrier->mutex);
void cutIncrementBarrier(CUTBarrier *barrier)
{
int myBarrierCount;
pthread_mutex_lock(&barrier->mutex);
myBarrierCount = ++barrier->count;
pthread_mutex_unlock(&barrier->mutex);
if (myBarrierCount >= barrier->releaseCount) {
pthread_cond_signal(&barrier->conditionVariable);
}
if (myBarrierCount >= barrier->releaseCount) {
pthread_cond_signal(&barrier->conditionVariable);
}
}
// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier) {
pthread_mutex_lock(&barrier->mutex);
void cutWaitForBarrier(CUTBarrier *barrier)
{
pthread_mutex_lock(&barrier->mutex);
while (barrier->count < barrier->releaseCount) {
pthread_cond_wait(&barrier->conditionVariable, &barrier->mutex);
}
while (barrier->count < barrier->releaseCount) {
pthread_cond_wait(&barrier->conditionVariable, &barrier->mutex);
}
pthread_mutex_unlock(&barrier->mutex);
pthread_mutex_unlock(&barrier->mutex);
}
// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier)
{
    pthread_mutex_destroy(&barrier->mutex);
    pthread_cond_destroy(&barrier->conditionVariable);
}
#endif
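For orientation, a minimal usage sketch of the helper API above (illustrative only, not part of the diff; the worker body and thread count are assumptions): each worker increments the barrier without blocking, while the main thread blocks in cutWaitForBarrier until all increments have arrived.

// Usage sketch (illustrative, not from the diff).
#include "multithreading.h"
#include <stdio.h>

static CUTBarrier g_barrier;

static CUT_THREADPROC worker(void *arg)
{
    int id = *(int *)arg;
    printf("worker %d done\n", id);
    cutIncrementBarrier(&g_barrier); // non-blocking; this thread may exit now
    CUT_THREADEND;
}

int main()
{
    const int n = 4;
    int       ids[4];
    CUTThread threads[4];

    g_barrier = cutCreateBarrier(n);

    for (int i = 0; i < n; i++) {
        ids[i]     = i;
        threads[i] = cutStartThread(worker, &ids[i]);
    }

    cutWaitForBarrier(&g_barrier); // returns once all n increments have arrived
    cutWaitForThreads(threads, n);
    cutDestroyBarrier(&g_barrier);
    return 0;
}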

View File

@@ -37,15 +37,16 @@
typedef HANDLE CUTThread;
typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *);
struct CUTBarrier
{
    CRITICAL_SECTION criticalSection;
    HANDLE           barrierEvent;
    int              releaseCount;
    int              count;
};
#define CUT_THREADPROC unsigned WINAPI
#define CUT_THREADEND return 0
#else
// POSIX threads.
@@ -55,44 +56,46 @@ typedef pthread_t CUTThread;
typedef void *(*CUT_THREADROUTINE)(void *);
#define CUT_THREADPROC void *
#define CUT_THREADEND return 0
struct CUTBarrier
{
    pthread_mutex_t mutex;
    pthread_cond_t  conditionVariable;
    int             releaseCount;
    int             count;
};
#endif
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
    // Create thread.
    CUTThread cutStartThread(CUT_THREADROUTINE, void *data);

    // Wait for thread to finish.
    void cutEndThread(CUTThread thread);

    // Wait for multiple threads.
    void cutWaitForThreads(const CUTThread *threads, int num);

    // Create barrier.
    CUTBarrier cutCreateBarrier(int releaseCount);

    // Increment barrier. (execution continues)
    void cutIncrementBarrier(CUTBarrier *barrier);

    // Wait for barrier release.
    void cutWaitForBarrier(CUTBarrier *barrier);

    // Destroy barrier
    void cutDestroyBarrier(CUTBarrier *barrier);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // MULTITHREADING_H

View File

@@ -43,172 +43,173 @@
#include <stdio.h>
// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>
#include "multithreading.h"
const int N_workloads = 8;
const int N_elements_per_workload = 100000;
CUTBarrier thread_barrier;
void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data);
struct heterogeneous_workload
{
    int id;
    int cudaDeviceID;

    int         *h_data;
    int         *d_data;
    cudaStream_t stream;

    bool success;
};
__global__ void incKernel(int *data, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N)
        data[i]++;
}
CUT_THREADPROC launch(void *void_arg)
{
    heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;

    // Select GPU for this CPU thread
    checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));

    // Allocate Resources
    checkCudaErrors(cudaStreamCreate(&workload->stream));
    checkCudaErrors(cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
    checkCudaErrors(cudaHostAlloc(&workload->h_data, N_elements_per_workload * sizeof(int), cudaHostAllocPortable));
    // CPU thread generates data
    for (int i = 0; i < N_elements_per_workload; ++i) {
        workload->h_data[i] = workload->id + i;
    }

    // Schedule work for GPU in CUDA stream without blocking the CPU thread
    // Note: Dedicated streams enable concurrent execution of workloads on the GPU
    dim3 block(512);
    dim3 grid((N_elements_per_workload + block.x - 1) / block.x);

    checkCudaErrors(cudaMemcpyAsync(workload->d_data,
                                    workload->h_data,
                                    N_elements_per_workload * sizeof(int),
                                    cudaMemcpyHostToDevice,
                                    workload->stream));
    incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data, N_elements_per_workload);
    checkCudaErrors(cudaMemcpyAsync(workload->h_data,
                                    workload->d_data,
                                    N_elements_per_workload * sizeof(int),
                                    cudaMemcpyDeviceToHost,
                                    workload->stream));

    // New in CUDA 5.0: Add a CPU callback which is called once all currently
    // pending operations in the CUDA stream have finished
    checkCudaErrors(cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));

    CUT_THREADEND;
    // CPU thread end of life, GPU continues to process data...
}
CUT_THREADPROC postprocess(void *void_arg)
{
    heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;
    // ... GPU is done with processing, continue on new CPU thread...

    // Select GPU for this CPU thread
    checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));

    // CPU thread consumes results from GPU
    workload->success = true;

    for (int i = 0; i < N_workloads; ++i) {
        workload->success &= workload->h_data[i] == i + workload->id + 1;
    }

    // Free Resources
    checkCudaErrors(cudaFree(workload->d_data));
    checkCudaErrors(cudaFreeHost(workload->h_data));
    checkCudaErrors(cudaStreamDestroy(workload->stream));

    // Signal the end of the heterogeneous workload to main thread
    cutIncrementBarrier(&thread_barrier);

    CUT_THREADEND;
}

void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data)
{
    // Check status of GPU after stream operations are done
    checkCudaErrors(status);

    // Spawn new CPU worker thread and continue processing on the CPU
    cutStartThread(postprocess, data);
}
int main(int argc, char **argv)
{
    int N_gpus, max_gpus = 0;
    int gpuInfo[32]; // assume a maximum of 32 GPUs in a system configuration

    printf("Starting simpleCallback\n");

    checkCudaErrors(cudaGetDeviceCount(&N_gpus));
    printf("Found %d CUDA capable GPUs\n", N_gpus);

    if (N_gpus > 32) {
        printf("simpleCallback only supports 32 GPU(s)\n");
    }

    for (int devid = 0; devid < N_gpus; devid++) {
        int            SMversion;
        cudaDeviceProp deviceProp;
        cudaSetDevice(devid);
        cudaGetDeviceProperties(&deviceProp, devid);
        SMversion = (deviceProp.major << 4) + deviceProp.minor; // parenthesized: '+' binds tighter than '<<'
        printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name, deviceProp.major, deviceProp.minor);
        printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable");

        if (SMversion >= 0x11) {
            gpuInfo[max_gpus++] = devid;
        }
    }

    printf("%d GPUs available to run Callback Functions\n", max_gpus);

    heterogeneous_workload *workloads;
    workloads      = (heterogeneous_workload *)malloc(N_workloads * sizeof(heterogeneous_workload));
    thread_barrier = cutCreateBarrier(N_workloads);

    // Main thread spawns a CPU worker thread for each heterogeneous workload
    printf("Starting %d heterogeneous computing workloads\n", N_workloads);

    for (int i = 0; i < N_workloads; ++i) {
        workloads[i].id           = i;
        workloads[i].cudaDeviceID = gpuInfo[i % max_gpus]; // i % N_gpus;

        cutStartThread(launch, &workloads[i]);
    }

    // Sleep until all workloads have finished
    cutWaitForBarrier(&thread_barrier);

    printf("Total of %d workloads finished:\n", N_workloads);

    bool success = true;

    for (int i = 0; i < N_workloads; ++i) {
        success &= workloads[i].success;
    }

    printf("%s\n", success ? "Success" : "Failure");

    free(workloads);

    exit(success ? EXIT_SUCCESS : EXIT_FAILURE);
}
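An aside, not part of the diff: cudaStreamAddCallback is deprecated in recent CUDA releases in favor of cudaLaunchHostFunc (CUDA 10+), which performs the same stream-ordered hand-off to host code. A hedged sketch of the equivalent call, reusing the sample's postprocess and cutStartThread; note the host function receives no status argument, so stream errors must be checked separately.

// Sketch only: cudaLaunchHostFunc is the documented successor to
// cudaStreamAddCallback for this pattern.
void CUDART_CB myHostFunc(void *data)
{
    // Runs once all prior work in the stream is done. Host functions must not
    // call CUDA APIs, so hand off to a CPU worker thread, as the sample does.
    cutStartThread(postprocess, data);
}

// ...inside launch(), in place of the cudaStreamAddCallback line:
// checkCudaErrors(cudaLaunchHostFunc(workload->stream, myHostFunc, workload));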

View File

@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")         # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@@ -27,6 +27,6 @@ cudaDeviceSynchronize, cudaGetErrorString
## Prerequisites
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your platform.
## References (for more details)

View File

@@ -38,8 +38,8 @@
*
*/
#include <cooperative_groups.h>
#include <stdio.h>
using namespace cooperative_groups;
@@ -49,35 +49,36 @@ using namespace cooperative_groups;
* calculates the sum of val across the group g. The workspace array, x,
* must be large enough to contain g.size() integers.
*/
__device__ int sumReduction(thread_group g, int *x, int val)
{
    // rank of this thread in the group
    int lane = g.thread_rank();

    // for each iteration of this loop, the number of threads active in the
    // reduction, i, is halved, and each active thread (with index [lane])
    // performs a single summation of its own value with that
    // of a "partner" (with index [lane+i]).
    for (int i = g.size() / 2; i > 0; i /= 2) {
        // store value for this thread in temporary array
        x[lane] = val;

        // synchronize all threads in group
        g.sync();

        if (lane < i)
            // active threads perform summation of their value with
            // their partner's value
            val += x[lane + i];

        // synchronize all threads in group
        g.sync();
    }

    // master thread in group returns result, and others return -1.
    if (g.thread_rank() == 0)
        return val;
    else
        return -1;
}
/**
@@ -85,93 +86,92 @@ __device__ int sumReduction(thread_group g, int *x, int val) {
*
* Creates cooperative groups and performs reductions
*/
__global__ void cgkernel()
{
    // threadBlockGroup includes all threads in the block
    thread_block threadBlockGroup     = this_thread_block();
    int          threadBlockGroupSize = threadBlockGroup.size();

    // workspace array in shared memory required for reduction
    extern __shared__ int workspace[];

    int input, output, expectedOutput;

    // input to reduction, for each thread, is its rank in the group
    input = threadBlockGroup.thread_rank();

    // expected output from analytical formula (n-1)(n)/2
    // (noting that indexing starts at 0 rather than 1)
    expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2;

    // perform reduction
    output = sumReduction(threadBlockGroup, workspace, input);

    // master thread in group prints out result
    if (threadBlockGroup.thread_rank() == 0) {
        printf(" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
               (int)threadBlockGroup.size() - 1,
               output,
               expectedOutput);

        printf(" Now creating %d groups, each of size 16 threads:\n\n", (int)threadBlockGroup.size() / 16);
    }

    threadBlockGroup.sync();

    // each tiledPartition16 group includes 16 threads
    thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup);

    // This offset allows each group to have its own unique area in the workspace
    // array
    int workspaceOffset = threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();

    // input to reduction, for each thread, is its rank in the group
    input = tiledPartition16.thread_rank();

    // expected output from analytical formula (n-1)(n)/2
    // (noting that indexing starts at 0 rather than 1)
    expectedOutput = 15 * 16 / 2;

    // Perform reduction
    output = sumReduction(tiledPartition16, workspace + workspaceOffset, input);

    // each master thread prints out result
    if (tiledPartition16.thread_rank() == 0)
        printf(" Sum of all ranks 0..15 in this tiledPartition16 group is %d "
               "(expected %d)\n",
               output,
               expectedOutput);

    return;
}
/**
* Host main routine
*/
int main()
{
    // Error code to check return values for CUDA calls
    cudaError_t err;

    // Launch the kernel
    int blocksPerGrid   = 1;
    int threadsPerBlock = 64;

    printf("\nLaunching a single block with %d threads...\n\n", threadsPerBlock);

    // we use the optional third argument to specify the size
    // of shared memory required in the kernel
    cgkernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>();
    err = cudaDeviceSynchronize();

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("\n...Done.\n\n");

    return 0;
}
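As an aside (not part of the diff): on CUDA 11 and newer, the tile reduction above can also be written with the library reduction from <cooperative_groups/reduce.h>, which needs no shared-memory workspace for tiles because it uses warp shuffles. A hedged sketch:

// Sketch only: library tile reduction, assuming CUDA 11+.
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <cstdio>
namespace cg = cooperative_groups;

__global__ void tileReduceKernel()
{
    cg::thread_block          block  = cg::this_thread_block();
    cg::thread_block_tile<16> tile16 = cg::tiled_partition<16>(block);

    // each rank contributes its own index; the result is returned to all ranks
    int sum = cg::reduce(tile16, (int)tile16.thread_rank(), cg::plus<int>());

    if (tile16.thread_rank() == 0)
        printf(" tile sum = %d (expected %d)\n", sum, 15 * 16 / 2);
}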

View File

@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")         # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries

View File

@@ -27,6 +27,6 @@ cudaMemcpy, cudaCreateChannelDesc, cudaFreeArray, cudaFree, cudaPitchedPtr, cuda
## Prerequisites
Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your platform.
## References (for more details)

View File

@@ -26,27 +26,27 @@
*/
/*
 * This sample demonstrates how to use texture fetches from layered 2D textures
 * in CUDA C
 *
 * This sample first generates a 3D input data array for the layered texture
 * and the expected output. Then it starts CUDA C kernels, one for each layer,
 * which fetch their layer's texture data (using normalized texture coordinates),
 * transform it to the expected output, and write it to a 3D output data array.
 */
// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes CUDA
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>
static const char *sSDKname = "simpleCubemapTexture";
@@ -56,213 +56,207 @@ static const char *sSDKname = "simpleCubemapTexture";
//! Transform a cubemap face of a linear buffer using cubemap texture lookups
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex)
{
    // calculate this thread's data point
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // 0.5f offset and division are necessary to access the original data points
    // in the texture (such that bilinear interpolation will not be activated).
    // For details, see also CUDA Programming Guide, Appendix D
    float u = ((x + 0.5f) / (float)width) * 2.f - 1.f;
    float v = ((y + 0.5f) / (float)width) * 2.f - 1.f;

    float cx, cy, cz;

    for (unsigned int face = 0; face < 6; face++) {
        // Layer 0 is positive X face
        if (face == 0) {
            cx = 1;
            cy = -v;
            cz = -u;
        }
        // Layer 1 is negative X face
        else if (face == 1) {
            cx = -1;
            cy = -v;
            cz = u;
        }
        // Layer 2 is positive Y face
        else if (face == 2) {
            cx = u;
            cy = 1;
            cz = v;
        }
        // Layer 3 is negative Y face
        else if (face == 3) {
            cx = u;
            cy = -1;
            cz = -v;
        }
        // Layer 4 is positive Z face
        else if (face == 4) {
            cx = u;
            cy = -v;
            cz = 1;
        }
        // Layer 5 is negative Z face
        else if (face == 5) {
            cx = -u;
            cy = -v;
            cz = -1;
        }

        // read from texture, do expected transformation and write to global memory
        g_odata[face * width * width + y * width + x] = -texCubemap<float>(tex, cx, cy, cz);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    bool bResult = true;

    // get number of SMs on this GPU
    cudaDeviceProp deviceProps;

    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
    printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

    if (deviceProps.major < 2) {
        printf("%s requires SM 2.0 or higher for support of Texture Arrays. Test "
               "will exit... \n",
               sSDKname);

        exit(EXIT_WAIVED);
    }

    // generate input data for layered texture
    unsigned int width = 64, num_faces = 6, num_layers = 1;
    unsigned int cubemap_size = width * width * num_faces;
    unsigned int size         = cubemap_size * num_layers * sizeof(float);
    float       *h_data       = (float *)malloc(size);

    for (int i = 0; i < (int)(cubemap_size * num_layers); i++) {
        h_data[i] = (float)i;
    }

    // this is the expected transformation of the input data (the expected output)
    float *h_data_ref = (float *)malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++) {
        for (int i = 0; i < (int)(cubemap_size); i++) {
            h_data_ref[layer * cubemap_size + i] = -h_data[layer * cubemap_size + i] + layer;
        }
    }

    // allocate device memory for result
    float *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, size));

    // allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray            *cu_3darray;
    // checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc,
    // make_cudaExtent(width, height, num_layers), cudaArrayLayered ));
    checkCudaErrors(
        cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, width, num_faces), cudaArrayCubemap));
    cudaMemcpy3DParms myparms = {0};
    myparms.srcPos            = make_cudaPos(0, 0, 0);
    myparms.dstPos            = make_cudaPos(0, 0, 0);
    myparms.srcPtr            = make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
    myparms.dstArray          = cu_3darray;
    myparms.extent            = make_cudaExtent(width, width, num_faces);
    myparms.kind              = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&myparms));

    cudaTextureObject_t tex;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cu_3darray;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModeLinear;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.addressMode[2]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1);

    printf("Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
           "block has 8 x 8 threads\n",
           width,
           num_layers,
           dimGrid.x,
           dimGrid.y);

    transformKernel<<<dimGrid, dimBlock>>>(d_data, width, tex); // warmup (for better timing)

    // check if kernel execution generated an error
    getLastCudaError("warmup Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // execute the kernel
    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, tex);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
    printf("%.2f Mtexlookups/sec\n", (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
    sdkDeleteTimer(&timer);

    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f, false);
    }
    else {
        printf("Comparing kernel output to expected data\n");

#define MIN_EPSILON_ERROR 5e-3f
        bResult = compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
    }

    // cleanup memory
    free(h_data);
    free(h_data_ref);
    free(h_odata);

    checkCudaErrors(cudaDestroyTextureObject(tex));
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cu_3darray));

    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
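One observation on the kernel above (an illustration, not part of the diff): the per-face if/else ladder is the standard cubemap face-to-direction mapping, and a table-driven helper makes the face order (+X, -X, +Y, -Y, +Z, -Z) harder to get wrong, as the mislabeled "Layer 4/5" comment showed. A sketch, with names chosen here for illustration:

// Sketch only: same mapping as the kernel's branch ladder.
struct Dir { float x, y, z; };

// face order matches the kernel: +X, -X, +Y, -Y, +Z, -Z
__host__ __device__ inline Dir cubemapDirection(unsigned int face, float u, float v)
{
    switch (face) {
        case 0:  return { 1.f,  -v,   -u };
        case 1:  return {-1.f,  -v,    u };
        case 2:  return {  u,  1.f,    v };
        case 3:  return {  u, -1.f,   -v };
        case 4:  return {  u,   -v,  1.f };
        default: return { -u,   -v, -1.f };
    }
}
// In the kernel: Dir d = cubemapDirection(face, u, v);
//                g_odata[...] = -texCubemap<float>(tex, d.x, d.y, d.z);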

View File

@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")         # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()
# Include directories and libraries
@@ -40,6 +42,12 @@ target_link_libraries(simpleDrvRuntime PUBLIC
set(CUDA_FATBIN_FILE "${CMAKE_CURRENT_BINARY_DIR}/vectorAdd_kernel64.fatbin")
set(CUDA_KERNEL_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/vectorAdd_kernel.cu")
# Construct GENCODE_FLAGS explicitly from CUDA architectures
set(GENCODE_FLAGS "")
foreach(arch ${CMAKE_CUDA_ARCHITECTURES})
    list(APPEND GENCODE_FLAGS "-gencode=arch=compute_${arch},code=sm_${arch}")
endforeach()
add_custom_command(
OUTPUT ${CUDA_FATBIN_FILE}
COMMAND ${CMAKE_CUDA_COMPILER} ${INCLUDES} ${ALL_CCFLAGS} -Wno-deprecated-gpu-targets ${GENCODE_FLAGS} -o ${CUDA_FATBIN_FILE} -fatbin ${CUDA_KERNEL_SOURCE}

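For orientation (a sketch under assumptions, not part of the diff): a fatbin built by a custom command like the one above is typically loaded at run time through the CUDA driver API, as simpleDrvRuntime does. The file and kernel names below are illustrative assumptions.

// Sketch only: loading a fatbin via the CUDA driver API.
#include <cuda.h>
#include <stdio.h>

int main()
{
    CUdevice   dev;
    CUcontext  ctx;
    CUmodule   mod;
    CUfunction fn;

    cuInit(0);
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);

    // load the container produced by the add_custom_command above
    if (cuModuleLoad(&mod, "vectorAdd_kernel64.fatbin") != CUDA_SUCCESS) {
        fprintf(stderr, "failed to load fatbin\n");
        return 1;
    }
    cuModuleGetFunction(&fn, mod, "VecAdd_kernel"); // assumed kernel name
    // ...set up argument pointers and launch with cuLaunchKernel(...)...

    cuModuleUnload(mod);
    cuCtxDestroy(ctx);
    return 0;
}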
Some files were not shown because too many files have changed in this diff.