Mirror of https://github.com/NVIDIA/cuda-samples.git (synced 2025-07-02 04:33:13 +08:00)
Compare commits
82 Commits
SHA1:
8a9e2c830c, adacf1cffd, da3b7a2b3c, 5987a9e9fa, 107f3f537f, b530f1cf42, cab7c66b4f, 8d400cfb7f, 6d6d964f97, ab68d58d59,
c70d79cf3b, 14b1bfdcc4, c14a0114d6, ee15cc0fe2, 3438fd4875, b27b55ec70, 49159f3739, 1680a1dc7f, 49daf0e4e0, a45fd3bd7c,
0345908807, 3b9c8ce2e9, e77d6eb5ab, ac700327a2, 17703dd426, a32d5badf7, 1fd22429c3, 00ac0a1673, b013387a39, 9d921e0fe7,
7d1730f348, 718fe6486d, ad9908e32b, 952d6edf92, 685709bfc7, 0c92c34ca9, 0d82634f70, 4abbdf4e80, 914ca00f89, c8034f368a,
ceab6e8bcc, 2cd58fbc9a, c0ab53f986, b87c243bbb, e214cd29aa, 06d72496c2, 2848d3bd21, bd0f630bf4, ab9166a6b2, c90a1c6981,
9370f11e69, 291435e0b4, 8d901e745d, 990ebc01c2, 9adce9d9f2, bcad2c9e61, e7b23470d5, 310e7f2a11, 7f0f63f311, acd3a015c8,
a9869fd6ea, 3e8f91d1a1, f3b7c41ad6, 29fb758e62, 3bc08136ff, 85eefa06c4, c357dd1e6b, efb46383e0, 8d564d5e3a, 37c5bcbef4,
940a4c7a91, 61bd39800d, 8a96d2eee7, e762d58260, 8fd1701744, 94765c1597, c87881f02c, 25400b6b3c, e24f62e28c, 22424227e7,
42ff742bf5, 8ccb13c6f0
.clang-format (new file, +49 lines)
@@ -0,0 +1,49 @@
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: Consecutive
AlignConsecutiveDeclarations: Consecutive
AlignConsecutiveMacros: Consecutive
AlignEscapedNewlines: Left
AlignOperands: AlignAfterOperator
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
    AfterClass: true
    AfterControlStatement: false
    AfterExternBlock: true
    AfterFunction: true
    AfterStruct: true
    AfterUnion: true
    BeforeCatch: true
    BeforeElse: true
    IndentBraces: false
BreakBeforeBraces: Custom
BreakBeforeConceptDeclarations: true
BreakBeforeBinaryOperators: NonAssignment
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
ColumnLimit: 120
DerivePointerAlignment: false
FixNamespaceComments: true
IncludeCategories:
    - Regex: '^<.*>'
      Priority: 1
    - Regex: '^".*"'
      Priority: 2
SortIncludes: true
IncludeBlocks: Regroup
IndentWidth: 4
MaxEmptyLinesToKeep: 2
PointerAlignment: Right
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
Standard: c++17
TabWidth: 4
UseTab: Never
...
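As a quick check that the style file is picked up, a single sample can be formatted in place (a minimal sketch; it assumes `clang-format` is installed and on `PATH`, and the file path is only an example):

```bash
# -style=file makes clang-format search upward for this .clang-format; -i rewrites the file in place
clang-format -style=file -i Samples/0_Introduction/asyncAPI/asyncAPI.cu
```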
.gitignore (vendored, +3)
@@ -1,3 +1,6 @@
 build
 .vs
 .clangd
+test
+settings.json
+launch.json
.pre-commit-config.yaml (new file, +106 lines)
@@ -0,0 +1,106 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
ci:
  autofix_commit_msg: |
    [pre-commit.ci] auto code formatting
  autofix_prs: false
  autoupdate_branch: ''
  autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
  autoupdate_schedule: quarterly
  skip: []
  submodules: false

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: end-of-file-fixer
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$|
            .*\.py$|
            .*\.json$
          )
      - id: mixed-line-ending
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$|
            .*\.py$|
            .*\.json$
          )
      - id: trailing-whitespace
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$|
            .*\.py$|
            .*\.json$
          )
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.6
    hooks:
      - id: clang-format
        types_or: [file]
        files: |
          (?x)^(
            ^.*\.c$|
            ^.*\.cpp$|
            ^.*\.cu$|
            ^.*\.cuh$|
            ^.*\.cxx$|
            ^.*\.h$|
            ^.*\.hpp$|
            ^.*\.inl$|
            ^.*\.mm$
          )
        exclude: |
          (?x)^(
            Common/.*
          )
        args: ["-fallback-style=none", "-style=file", "-i"]
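With this configuration in place, a single hook can also be run by id (a sketch; it assumes `pre-commit` is installed, as described in CONTRIBUTING.md below):

```bash
# run only the clang-format hook from .pre-commit-config.yaml across the whole tree
pre-commit run clang-format --all-files
```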
CHANGELOG.md (11 changed lines)
@@ -1,5 +1,15 @@
 ## Changelog
 
+### CUDA 12.9
+* Updated toolchain for cross-compilation for Tegra Linux platforms.
+* Added `run_tests.py` utility to exercise all samples. See README.md for details.
+* Repository has been updated with consistent code formatting across all samples.
+* Many small code tweaks and bug fixes (see commit history for details).
+* Removed the following outdated samples:
+    * `1_Utilities`
+        * `bandwidthTest` - this sample was out of date and did not produce accurate results. For bandwidth testing of NVIDIA GPU platforms, please refer to [NVBandwidth](https://github.com/NVIDIA/nvbandwidth)
+
 ### CUDA 12.8
 * Updated build system across the repository to CMake. Removed Visual Studio project files and Makefiles.
 * Removed the following outdated samples:
@@ -36,6 +46,7 @@
     * `cuDLALayerwiseStatsHybrid`
     * `cuDLALayerwiseStatsStandalone`
     * `cuDLAStandaloneMode`
+    * `cudaNvSciBufMultiplanar`
     * `cudaNvSciNvMedia`
     * `fluidsGLES`
     * `nbody_opengles`
@@ -16,8 +16,10 @@ set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 
 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
 endif()
 
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
CONTRIBUTING.md (new file, +103 lines)

# Contributing to the CUDA Samples

Thank you for your interest in contributing to the CUDA Samples!

## Getting Started

1. **Fork & Clone the Repository**:

   Fork the repository and clone the fork. For more information, check [GitHub's documentation on forking](https://docs.github.com/en/github/getting-started-with-github/fork-a-repo) and [cloning a repository](https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/cloning-a-repository).

## Making Changes

1. **Create a New Branch**:

   ```bash
   git checkout -b your-feature-branch
   ```

2. **Make Changes**.

3. **Build and Test**:

   Ensure changes don't break existing functionality by building and running tests.

   For more details on building and testing, refer to the [Building and Testing](#building-and-testing) section below.

4. **Commit Changes**:

   ```bash
   git commit -m "Brief description of the change"
   ```

## Building and Testing

For information on building and running tests on the samples, please refer to the main [README](README.md).

## Creating a Pull Request

1. Push changes to your fork.
2. Create a pull request targeting the `master` branch of the original CUDA Samples repository. Refer to [GitHub's documentation](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) for more information on creating a pull request.
3. Describe the purpose and context of the changes in the pull request description.

## Code Formatting (pre-commit hooks)

The CUDA Samples repository uses [pre-commit](https://pre-commit.com/) to execute all code linters and formatters. These tools ensure a consistent coding style throughout the project. Using pre-commit ensures that linter versions and options are aligned for all developers. Additionally, there is a CI check in place to enforce that committed code follows our standards.

The linters used by the CUDA Samples are listed in `.pre-commit-config.yaml`.
For example, C++ and CUDA code is formatted with [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html).

To use `pre-commit`, install via `conda` or `pip`:

```bash
conda config --add channels conda-forge
conda install pre-commit
```

```bash
pip install pre-commit
```

Then run pre-commit hooks before committing code:

```bash
pre-commit run
```

By default, pre-commit runs on staged files (only changes and additions that will be committed).
To run pre-commit checks on all files, execute:

```bash
pre-commit run --all-files
```

Optionally, you may set up the pre-commit hooks to run automatically when you make a git commit. This can be done by running:

```bash
pre-commit install
```

Now code linters and formatters will be run each time you commit changes.

You can skip these checks with `git commit --no-verify` or with the short version `git commit -n`, although please note that this may result in pull requests being rejected if subsequent checks fail.

## Review Process

Once submitted, maintainers will be automatically assigned to review the pull request. They might suggest changes or improvements. Constructive feedback is a part of the collaborative process, aimed at ensuring the highest quality code.

For constructive feedback and effective communication during reviews, we recommend following [Conventional Comments](https://conventionalcomments.org/).

Further recommended reading for successful PR reviews:

- [How to Do Code Reviews Like a Human (Part One)](https://mtlynch.io/human-code-reviews-1/)
- [How to Do Code Reviews Like a Human (Part Two)](https://mtlynch.io/human-code-reviews-2/)

## Thank You

Your contributions enhance the CUDA Samples for the entire community. We appreciate your effort and collaboration!
@@ -241,7 +241,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
   }
 
   unsigned long long compute_perf =
-      (unsigned long long)(multiProcessorCount * sm_per_multiproc *
-                           clockRate);
+      ((unsigned long long)multiProcessorCount * sm_per_multiproc *
+       clockRate);
 
   if (compute_perf > max_compute_perf) {

(The cast now applies to the first operand, so the product is computed in 64-bit arithmetic rather than overflowing in `int` before being widened.)
@@ -258,7 +258,7 @@ namespace nv
     s[2] = &r3[0];
     s[3] = &r4[0];
 
-    register int i,j,p,jj;
+    int i,j,p,jj;
 
     for (i=0; i<4; i++)
     {

(The `register` storage class was removed in C++17, the standard now set in the new .clang-format.)
README.md (189 changed lines)
@@ -1,6 +1,6 @@
 # CUDA Samples
 
-Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.6](https://developer.nvidia.com/cuda-downloads).
+Samples for CUDA Developers which demonstrate features in the CUDA Toolkit. This version supports [CUDA Toolkit 12.9](https://developer.nvidia.com/cuda-downloads).
 
 ## Release Notes
 
@@ -14,7 +14,7 @@ This section describes the release notes for the CUDA Samples on GitHub only.
 
 ### Prerequisites
 
-Download and install the [CUDA Toolkit 12.8](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 For system requirements and installation instructions of cuda toolkit, please refer to the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/), and the [Windows Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html).
 
 ### Getting the CUDA Samples
@@ -72,6 +72,17 @@ Open the generated solution file CUDA_Samples.sln in Visual Studio. Build the sa
 
 Run the samples from the output directories specified in Visual Studio.
 
+### Enabling On-GPU Debugging
+
+NVIDIA GPUs support on-GPU debugging through cuda-gdb. Enabling this may significantly affect application performance, as certain compiler optimizations are disabled in this configuration; hence it is not on by default. On-device debugging is controlled via the `-G` switch to nvcc.
+
+To enable cuda-gdb for samples builds, define the `ENABLE_CUDA_DEBUG` flag on the CMake command line. For example:
+
+```
+cmake -DENABLE_CUDA_DEBUG=True ...
+```
+
 ### Platform-Specific Samples
 
 Some CUDA samples are specific to certain platforms, and require passing flags into CMake to enable. In particular, we define the following platform-specific flags:
@@ -94,9 +105,9 @@ Navigate to the root of the cloned repository and create a build directory:
 ```
 mkdir build && cd build
 ```
-Configure the project with CMake, specifying the Tegra toolchain file:
+Configure the project with CMake, specifying the Tegra toolchain file. You can use `-DTARGET_FS` to point to the target file system root path for the necessary include and library files:
 ```
-cmake .. -DCMAKE_TOOLCHAIN_FILE=/path/to/tegra/toolchain.cmake
+cmake .. -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/toolchain-aarch64-linux.cmake -DTARGET_FS=/path/to/target/system/file/system
 ```
 Build the samples:
 ```
@@ -111,7 +122,7 @@ Instead of being in the default location, `/usr/local/cuda/include` or `/usr/loc
 
 `/usr/local/cuda/<ARCH>/targets/aarch64-linux/lib`
 and
-`/usr/local/cuda-12.8/<ARCH>/include`
+`/usr/local/cuda/<ARCH>/include`
 
 An example build might look like this:
 
@@ -128,6 +139,168 @@ Note that in the current branch sample cross-compilation for QNX is not fully va
 near future with QNX cross-compilation instructions. In the meantime, if you want to cross-compile for QNX please check out one
 of the previous tags prior to the CMake build system transition in 12.8.
 
+## Running All Samples as Tests
+
+It's important to note that the CUDA samples are _not_ intended as a validation suite for CUDA. They do not cover corner cases, they do not completely cover the runtime and driver APIs, and they are not intended for performance benchmarking. That said, it can sometimes be useful to run all of the samples as a quick sanity check, and we provide a script to do so, `run_tests.py`.
+
+This Python3 script finds all executables in a subdirectory you choose, matching application names with command line arguments specified in `test_args.json`. It accepts the following command line arguments:
+
+| Switch     | Purpose                                                                                                          | Example                 |
+| ---------- | ---------------------------------------------------------------------------------------------------------------- | ----------------------- |
+| --dir      | Specify the root directory to search for executables (recursively)                                              | --dir ./build/Samples   |
+| --config   | JSON configuration file for executable arguments                                                                 | --config test_args.json |
+| --output   | Output directory for test results (stdout saved to .txt files - directory will be created if it doesn't exist)  | --output ./test         |
+| --args     | Global arguments to pass to all executables (not currently used)                                                 | --args arg_1 arg_2 ...  |
+| --parallel | Number of applications to execute in parallel.                                                                   | --parallel 8            |
+
+Application configurations are loaded from `test_args.json` and matched against executable names (discarding the `.exe` extension on Windows).
+
+The script returns 0 on success, or the first non-zero error code encountered during testing on failure. It will also print a condensed list of samples that failed, if any.
+
+There are three primary modes of configuration:
+
+**Skip**
+
+An executable configured with "skip" will not be executed. These generally rely on having attached graphical displays and are not suited to this kind of automation.
+
+Configuration example:
+```json
+"fluidsGL": {
+    "skip": true
+}
+```
+
+You will see:
+```
+Skipping fluidsGL (marked as skip in config)
+```
+
+**Single Run**
+
+For executables to run one time only with arguments, specify each argument as a list entry. Each entry in the JSON file will be appended to the command line, separated by a space.
+
+All applications execute from their current directory, so all paths are relative to the application's location.
+
+Note that if an application needs no arguments, this entry is optional. An executable found without a matching entry in the JSON will just run as `./application` from its current directory.
+
+Configuration example:
+```json
+"ptxgen": {
+    "args": [
+        "test.ll",
+        "-arch=compute_75"
+    ]
+}
+```
+
+You will see:
+```
+Running ptxgen
+Command: ./ptxgen test.ll -arch=compute_75
+Test completed with return code 0
+```
+
+**Multiple Runs**
+
+For executables to run multiple times with different command line arguments, specify any number of sets of args within a "runs" list.
+
+As with single runs, all applications execute from their current directory, so all paths are relative to the application's location.
+
+Configuration example:
+```json
+"recursiveGaussian": {
+    "runs": [
+        {
+            "args": [
+                "-sigma=10",
+                "-file=data/ref_10.ppm"
+            ]
+        },
+        {
+            "args": [
+                "-sigma=14",
+                "-file=data/ref_14.ppm"
+            ]
+        },
+        {
+            "args": [
+                "-sigma=18",
+                "-file=data/ref_18.ppm"
+            ]
+        },
+        {
+            "args": [
+                "-sigma=22",
+                "-file=data/ref_22.ppm"
+            ]
+        }
+    ]
+}
+```
+
+You will see:
+```
+Running recursiveGaussian (run 1/4)
+Command: ./recursiveGaussian -sigma=10 -file=data/ref_10.ppm
+Test completed with return code 0
+Running recursiveGaussian (run 2/4)
+Command: ./recursiveGaussian -sigma=14 -file=data/ref_14.ppm
+Test completed with return code 0
+Running recursiveGaussian (run 3/4)
+Command: ./recursiveGaussian -sigma=18 -file=data/ref_18.ppm
+Test completed with return code 0
+Running recursiveGaussian (run 4/4)
+Command: ./recursiveGaussian -sigma=22 -file=data/ref_22.ppm
+Test completed with return code 0
+```
+
+### Example Usage
+
+Here is an example set of commands to build and test all of the samples.
+
+First, build:
+```bash
+mkdir build
+cd build
+cmake ..
+make -j$(nproc)
+```
+
+Now, return to the samples root directory and run the test script:
+```bash
+cd ..
+python3 run_tests.py --output ./test --dir ./build/Samples --config test_args.json
+```
+
+If all applications run successfully, you will see something similar to this (the specific number of samples will depend on your build type and system configuration):
+
+```
+Test Summary:
+Ran 199 test runs for 180 executables.
+All test runs passed!
+```
+
+If some samples fail, you will see something like this:
+
+```
+Test Summary:
+Ran 199 test runs for 180 executables.
+Failed runs (2):
+bicubicTexture (run 1/5): Failed (code 1)
+Mandelbrot (run 1/2): Failed (code 1)
+```
+
+You can inspect the stdout logs in the output directory (generally `APM_<application_name>.txt` or `APM_<application_name>.run<n>.txt`) to help determine what may have gone wrong. Please file issues against the samples repository if you believe a sample is failing incorrectly on your system.
+
 ## Samples list
 
 ### [0. Introduction](./Samples/0_Introduction/README.md)
@@ -170,7 +343,7 @@ These third-party dependencies are required by some CUDA samples. If available,
 
 FreeImage is an open source imaging library. FreeImage can usually be installed on Linux using your distribution's package manager system. FreeImage can also be downloaded from the FreeImage website.
 
-To set up FreeImage on a Windows system, extract the FreeImage DLL distribution into the folder `../../../Common/FreeImage/Dist/x64` such that it contains the .h and .lib files. Copy the .dll file to the Release/ Debug/ execution folder or pass the FreeImage folder when cmake configuring with the `-DFREEIMAGE_INCLUDE_DIR` and `-DFREEIMAGE_LIBRARY` options.
+To set up FreeImage on a Windows system, extract the FreeImage DLL distribution into the folder `./Common/FreeImage/Dist/x64` such that it contains the .h and .lib files. Copy the .dll file to the Release/ or Debug/ execution folder, or pass the FreeImage folder when configuring with CMake via the `-DFreeImage_INCLUDE_DIR` and `-DFreeImage_LIBRARY` options.
 
 #### Message Passing Interface
 
@@ -203,11 +376,11 @@ Vulkan is a low-overhead, cross-platform 3D graphics and compute API. Vulkan tar
 #### GLFW
 GLFW is a lightweight, open-source library designed for managing OpenGL, OpenGL ES, and Vulkan contexts. It simplifies the process of creating and managing windows, handling user input (keyboard, mouse, and joystick), and working with multiple monitors in a cross-platform manner.
 
-To set up GLFW on a Windows system, Download the pre-built binaries from [GLFW website](https://www.glfw.org/download.html) and extract the zip file into the folder, pass the GLFW include header as `-DGLFW_INCLUDE_DIR` for cmake configuring and follow the Build_instructions.txt in the sample folder to set up the t.
+To set up GLFW on a Windows system, download the pre-built binaries from the [GLFW website](https://www.glfw.org/download.html), extract the zip file, and pass the GLFW include folder as `-DGLFW_INCLUDE_DIR` and the lib folder as `-DGLFW_LIB_DIR` when configuring with CMake.
 
 #### OpenMP
 
-OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/).
+OpenMP is an API for multiprocessing programming. OpenMP can be installed using your Linux distribution's package manager system. It usually comes preinstalled with GCC. It can also be found at the [OpenMP website](http://openmp.org/). For compilers such as clang, `libomp.so` and other components for LLVM must be installed separately. You will also need to set additional flags in your CMake configuration, such as: `-DOpenMP_CXX_FLAGS="-fopenmp=libomp" -DOpenMP_CXX_LIB_NAMES="omp" -DOpenMP_omp_LIBRARY="/path/to/libomp.so"`.
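For illustration, a full configure line for a clang toolchain might look like this (a sketch only; the compiler names and the `libomp.so` path are assumptions that vary by system):

```bash
cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
    -DOpenMP_CXX_FLAGS="-fopenmp=libomp" \
    -DOpenMP_CXX_LIB_NAMES="omp" \
    -DOpenMP_omp_LIBRARY=/usr/lib/libomp.so   # path is an assumption; locate your libomp.so
```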
 #### Screen
 
@@ -1,20 +1,3 @@
-cmake_minimum_required(VERSION 3.20)
-
-list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake/Modules")
-
-project(simpleCallback LANGUAGES C CXX CUDA)
-
-find_package(CUDAToolkit REQUIRED)
-
-set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-
-set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
-endif()
-
-
 add_subdirectory(UnifiedMemoryStreams)
 add_subdirectory(asyncAPI)
 add_subdirectory(clock)
@@ -55,6 +38,7 @@ add_subdirectory(simpleTexture3D)
 add_subdirectory(simpleTextureDrv)
 add_subdirectory(simpleVoteIntrinsics)
 add_subdirectory(simpleZeroCopy)
+add_subdirectory(template)
 add_subdirectory(systemWideAtomics)
 add_subdirectory(vectorAdd)
 add_subdirectory(vectorAddDrv)
@@ -10,15 +10,21 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
 endif()
 
 # Include directories and libraries
 include_directories(../../../Common)
 
 # Source file
-find_package(OpenMP REQUIRED)
+if(CMAKE_GENERATOR MATCHES "Visual Studio")
+    find_package(OpenMP REQUIRED C CXX)
+else()
+    find_package(OpenMP REQUIRED)
+endif()
 
 if(${OpenMP_FOUND})
     # Add target for UnifiedMemoryStreams
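To exercise the new generator check, a Windows configure might look like this (a sketch; the generator string depends on the installed Visual Studio version):

```bash
# "Visual Studio 17 2022" matches the generator test above, so OpenMP is searched for C and CXX only
cmake -S . -B build -G "Visual Studio 17 2022"
```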
@@ -28,7 +28,7 @@ cudaStreamDestroy, cudaFree, cudaMallocManaged, cudaStreamAttachMemAsync, cudaSe
 
 ## Prerequisites
 
-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.
 
 ## References (for more details)
@@ -31,10 +31,10 @@
  */
 
 // system includes
+#include <algorithm>
 #include <cstdio>
 #include <ctime>
 #include <vector>
-#include <algorithm>
 
 #ifdef USE_PTHREADS
 #include <pthread.h>
 #else
UnifiedMemoryStreams.cpp
@@ -51,291 +51,287 @@
This hunk is a formatting-only reflow under the new .clang-format style (braces on their own line for functions and structs, constructor initializers broken before commas, aligned declarations, 120-column lines); behavior is unchanged. The reformatted code:

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// SRAND48 and DRAND48 don't exist on windows, but these are the equivalent
// functions
void   srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif

const char *sSDKname = "UnifiedMemoryStreams";

// simple task
template <typename T> struct Task
{
    unsigned int size, id;
    T           *data;
    T           *result;
    T           *vector;

    Task()
        : size(0)
        , id(0)
        , data(NULL)
        , result(NULL)
        , vector(NULL) {};
    Task(unsigned int s)
        : size(s)
        , id(0)
        , data(NULL)
        , result(NULL)
    {
        // allocate unified memory -- the operation performed in this example will
        // be a DGEMV
        checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
        checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
        checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
        checkCudaErrors(cudaDeviceSynchronize());
    }

    ~Task()
    {
        // ensure all memory is deallocated
        checkCudaErrors(cudaDeviceSynchronize());
        checkCudaErrors(cudaFree(data));
        checkCudaErrors(cudaFree(result));
        checkCudaErrors(cudaFree(vector));
    }

    void allocate(const unsigned int s, const unsigned int unique_id)
    {
        // allocate unified memory outside of constructor
        id   = unique_id;
        size = s;
        checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
        checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
        checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
        checkCudaErrors(cudaDeviceSynchronize());

        // populate data with random elements
        for (unsigned int i = 0; i < size * size; i++) {
            data[i] = drand48();
        }

        for (unsigned int i = 0; i < size; i++) {
            result[i] = 0.;
            vector[i] = drand48();
        }
    }
};

#ifdef USE_PTHREADS
struct threadData_t
{
    int             tid;
    Task<double>   *TaskListPtr;
    cudaStream_t   *streams;
    cublasHandle_t *handles;
    int             taskSize;
};

typedef struct threadData_t threadData;
#endif

// simple host dgemv: assume data is in row-major format and square
template <typename T> void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result)
{
    // rows
    for (int i = 0; i < n; i++) {
        result[i] *= beta;

        for (int j = 0; j < n; j++) {
            result[i] += A[i * n + j] * x[j];
        }
    }
}

// execute a single task on either host or device depending on size
#ifdef USE_PTHREADS
void *execute(void *inpArgs)
{
    threadData     *dataPtr = (threadData *)inpArgs;
    cudaStream_t   *stream  = dataPtr->streams;
    cublasHandle_t *handle  = dataPtr->handles;
    int             tid     = dataPtr->tid;

    for (int i = 0; i < dataPtr->taskSize; i++) {
        Task<double> &t = dataPtr->TaskListPtr[i];

        if (t.size < 100) {
            // perform on host
            printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);

            // attach managed memory to a (dummy) stream to allow host access while
            // the device is running
            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
            // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
            checkCudaErrors(cudaStreamSynchronize(stream[0]));
            // call the host operation
            gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
        }
        else {
            // perform on device
            printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
            double one  = 1.0;
            double zero = 0.0;

            // attach managed memory to my stream
            checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
            checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
            // call the device operation
            checkCudaErrors(cublasDgemv(
                handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
        }
    }

    pthread_exit(NULL);
}
#else
template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
    if (t.size < 100) {
        // perform on host
        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);

        // attach managed memory to a (dummy) stream to allow host access while the
        // device is running
        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
        // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
        checkCudaErrors(cudaStreamSynchronize(stream[0]));
        // call the host operation
        gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
    }
    else {
        // perform on device
        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
        double one  = 1.0;
        double zero = 0.0;

        // attach managed memory to my stream
        checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
        // call the device operation
        checkCudaErrors(cublasDgemv(
            handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
    }
}
#endif

// populate a list of tasks with random sizes
template <typename T> void initialise_tasks(std::vector<Task<T>> &TaskList)
{
    for (unsigned int i = 0; i < TaskList.size(); i++) {
        // generate random size
        int size;
        size = std::max((int)(drand48() * 1000.0), 64);
        TaskList[i].allocate(size, i);
    }
}

int main(int argc, char **argv)
{
    // set device
    cudaDeviceProp device_prop;
    int            dev_id = findCudaDevice(argc, (const char **)argv);
    checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));

    if (!device_prop.managedMemory) {
        // This samples requires being run on a device that supports Unified Memory
        fprintf(stderr, "Unified Memory not supported on this device\n");

        exit(EXIT_WAIVED);
    }

    if (device_prop.computeMode == cudaComputeModeProhibited) {
        // This sample requires being run with a default or process exclusive mode
        fprintf(stderr,
                "This sample requires a device in either default or process "
                "exclusive mode\n");

        exit(EXIT_WAIVED);
    }

    // randomise task sizes
    int seed = (int)time(NULL);
    srand48(seed);

    // set number of threads
    const int nthreads = 4;

    // number of streams = number of threads
    cudaStream_t   *streams = new cudaStream_t[nthreads + 1];
    cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];

    for (int i = 0; i < nthreads + 1; i++) {
        checkCudaErrors(cudaStreamCreate(&streams[i]));
        checkCudaErrors(cublasCreate(&handles[i]));
    }

    // create list of N tasks
    unsigned int              N = 40;
    std::vector<Task<double>> TaskList(N);
    initialise_tasks(TaskList);

    printf("Executing tasks on host / device\n");

    // run through all tasks using threads and streams
#ifdef USE_PTHREADS
    pthread_t   threads[nthreads];
    threadData *InputToThreads = new threadData[nthreads];

    for (int i = 0; i < nthreads; i++) {
        checkCudaErrors(cudaSetDevice(dev_id));
        InputToThreads[i].tid     = i;
        InputToThreads[i].streams = streams;
        InputToThreads[i].handles = handles;

        if ((TaskList.size() / nthreads) == 0) {
            InputToThreads[i].taskSize    = (TaskList.size() / nthreads);
            InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
        }
        else {
            if (i == nthreads - 1) {
                InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
                InputToThreads[i].TaskListPtr =
                    &TaskList[i * (TaskList.size() / nthreads) + (TaskList.size() % nthreads)];
            }
            else {
                InputToThreads[i].taskSize    = (TaskList.size() / nthreads);
                InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
            }
        }

        pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
    }
    for (int i = 0; i < nthreads; i++) {
        pthread_join(threads[i], NULL);
    }
#else
    omp_set_num_threads(nthreads);
#pragma omp parallel for schedule(dynamic)
    for (int i = 0; i < TaskList.size(); i++) {
        checkCudaErrors(cudaSetDevice(dev_id));
        int tid = omp_get_thread_num();
        execute(TaskList[i], handles, streams, tid);
    }
#endif

    cudaDeviceSynchronize();

    // Destroy CUDA Streams, cuBlas handles
    for (int i = 0; i < nthreads + 1; i++) {
        cudaStreamDestroy(streams[i]);
        cublasDestroy(handles[i]);
    }

    // Free TaskList
    std::vector<Task<double>>().swap(TaskList);

    printf("All Done!\n");
    exit(EXIT_SUCCESS);
}
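To rebuild and run just this sample from a configured build tree (a sketch; the target name and output path are assumptions based on the repository layout):

```bash
cmake --build build --target UnifiedMemoryStreams -j                        # target name assumed to match the sample
./build/Samples/0_Introduction/UnifiedMemoryStreams/UnifiedMemoryStreams    # output path may differ on your system
```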
@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
 endif()
 
 # Include directories and libraries
@@ -27,6 +27,6 @@ cudaProfilerStop, cudaMalloc, cudaMemcpyAsync, cudaFree, cudaMallocHost, cudaPro
 
 ## Prerequisites
 
-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 
 ## References (for more details)
@@ -38,105 +38,107 @@
 #include <stdio.h>

 // includes CUDA Runtime
-#include <cuda_runtime.h>
 #include <cuda_profiler_api.h>
+#include <cuda_runtime.h>

 // includes, project
 #include <helper_cuda.h>
 #include <helper_functions.h> // helper utility functions

-__global__ void increment_kernel(int *g_data, int inc_value) {
+__global__ void increment_kernel(int *g_data, int inc_value)
+{
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     g_data[idx] = g_data[idx] + inc_value;
 }

-bool correct_output(int *data, const int n, const int x) {
+bool correct_output(int *data, const int n, const int x)
+{
     for (int i = 0; i < n; i++)
         if (data[i] != x) {
             printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
             return false;
         }

     return true;
 }

-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     int devID;
     cudaDeviceProp deviceProps;

     printf("[%s] - Starting...\n", argv[0]);

     // This will pick the best possible CUDA capable device
     devID = findCudaDevice(argc, (const char **)argv);

     // get device name
     checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
     printf("CUDA device [%s]\n", deviceProps.name);

     int n = 16 * 1024 * 1024;
     int nbytes = n * sizeof(int);
     int value = 26;

     // allocate host memory
     int *a = 0;
     checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
     memset(a, 0, nbytes);

     // allocate device memory
     int *d_a = 0;
     checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
     checkCudaErrors(cudaMemset(d_a, 255, nbytes));

     // set kernel launch configuration
     dim3 threads = dim3(512, 1);
     dim3 blocks = dim3(n / threads.x, 1);

     // create cuda event handles
     cudaEvent_t start, stop;
     checkCudaErrors(cudaEventCreate(&start));
     checkCudaErrors(cudaEventCreate(&stop));

     StopWatchInterface *timer = NULL;
     sdkCreateTimer(&timer);
     sdkResetTimer(&timer);

     checkCudaErrors(cudaDeviceSynchronize());
     float gpu_time = 0.0f;

     // asynchronously issue work to the GPU (all to stream 0)
     checkCudaErrors(cudaProfilerStart());
     sdkStartTimer(&timer);
     cudaEventRecord(start, 0);
     cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
     increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
     cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
     cudaEventRecord(stop, 0);
     sdkStopTimer(&timer);
     checkCudaErrors(cudaProfilerStop());

     // have CPU do some work while waiting for stage 1 to finish
     unsigned long int counter = 0;

     while (cudaEventQuery(stop) == cudaErrorNotReady) {
         counter++;
     }

     checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

     // print the cpu and gpu times
     printf("time spent executing by the GPU: %.2f\n", gpu_time);
     printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
-    printf("CPU executed %lu iterations while waiting for GPU to finish\n",
-           counter);
+    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

     // check the output for correctness
     bool bFinalResults = correct_output(a, n, value);

     // release resources
     checkCudaErrors(cudaEventDestroy(start));
     checkCudaErrors(cudaEventDestroy(stop));
     checkCudaErrors(cudaFreeHost(a));
     checkCudaErrors(cudaFree(d_a));

     exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
 }
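(A note on the busy-wait above: spinning on cudaEventQuery is deliberate, since the sample measures how many CPU iterations fit in the shadow of the GPU work. When there is no useful CPU work to overlap, a blocking wait is the idiomatic alternative; a minimal sketch reusing the sample's `start`/`stop` events:)

    // Block until the recorded event completes instead of polling.
    checkCudaErrors(cudaEventSynchronize(stop));
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));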
@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
 endif()

 # Include directories and libraries
@@ -27,6 +27,6 @@ cudaMalloc, cudaMemcpy, cudaFree

 ## Prerequisites

-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
@@ -48,43 +48,46 @@
 // This kernel computes a standard parallel reduction and evaluates the
 // time it takes to do that for each block. The timing results are stored
 // in device memory.
-__global__ static void timedReduction(const float *input, float *output,
-                                      clock_t *timer) {
+__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
+{
     // __shared__ float shared[2 * blockDim.x];
     extern __shared__ float shared[];

     const int tid = threadIdx.x;
     const int bid = blockIdx.x;

-    if (tid == 0) timer[bid] = clock();
+    if (tid == 0)
+        timer[bid] = clock();

     // Copy input.
     shared[tid] = input[tid];
     shared[tid + blockDim.x] = input[tid + blockDim.x];

     // Perform reduction to find minimum.
     for (int d = blockDim.x; d > 0; d /= 2) {
         __syncthreads();

         if (tid < d) {
             float f0 = shared[tid];
             float f1 = shared[tid + d];

             if (f1 < f0) {
                 shared[tid] = f1;
             }
         }
     }

     // Write result.
-    if (tid == 0) output[bid] = shared[0];
+    if (tid == 0)
+        output[bid] = shared[0];

     __syncthreads();

-    if (tid == 0) timer[bid + gridDim.x] = clock();
+    if (tid == 0)
+        timer[bid + gridDim.x] = clock();
 }

 #define NUM_BLOCKS 64
 #define NUM_THREADS 256

 // It's interesting to change the number of blocks and the number of threads to
|
|||||||
// the memory. With more than 32 the speed scales linearly.
|
// the memory. With more than 32 the speed scales linearly.
|
||||||
|
|
||||||
// Start the main CUDA Sample here
|
// Start the main CUDA Sample here
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv)
|
||||||
printf("CUDA Clock sample\n");
|
{
|
||||||
|
printf("CUDA Clock sample\n");
|
||||||
|
|
||||||
// This will pick the best possible CUDA capable device
|
// This will pick the best possible CUDA capable device
|
||||||
int dev = findCudaDevice(argc, (const char **)argv);
|
int dev = findCudaDevice(argc, (const char **)argv);
|
||||||
|
|
||||||
float *dinput = NULL;
|
float *dinput = NULL;
|
||||||
float *doutput = NULL;
|
float *doutput = NULL;
|
||||||
clock_t *dtimer = NULL;
|
clock_t *dtimer = NULL;
|
||||||
|
|
||||||
clock_t timer[NUM_BLOCKS * 2];
|
clock_t timer[NUM_BLOCKS * 2];
|
||||||
float input[NUM_THREADS * 2];
|
float input[NUM_THREADS * 2];
|
||||||
|
|
||||||
for (int i = 0; i < NUM_THREADS * 2; i++) {
|
for (int i = 0; i < NUM_THREADS * 2; i++) {
|
||||||
input[i] = (float)i;
|
input[i] = (float)i;
|
||||||
}
|
}
|
||||||
|
|
||||||
checkCudaErrors(
|
checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
|
||||||
cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
|
checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
|
||||||
checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
|
checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
|
||||||
checkCudaErrors(
|
|
||||||
cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
|
|
||||||
|
|
||||||
checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2,
|
checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
|
||||||
cudaMemcpyHostToDevice));
|
|
||||||
|
|
||||||
timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(
|
timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
|
||||||
dinput, doutput, dtimer);
|
|
||||||
|
|
||||||
checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2,
|
checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
|
||||||
cudaMemcpyDeviceToHost));
|
|
||||||
|
|
||||||
checkCudaErrors(cudaFree(dinput));
|
checkCudaErrors(cudaFree(dinput));
|
||||||
checkCudaErrors(cudaFree(doutput));
|
checkCudaErrors(cudaFree(doutput));
|
||||||
checkCudaErrors(cudaFree(dtimer));
|
checkCudaErrors(cudaFree(dtimer));
|
||||||
|
|
||||||
long double avgElapsedClocks = 0;
|
long double avgElapsedClocks = 0;
|
||||||
|
|
||||||
for (int i = 0; i < NUM_BLOCKS; i++) {
|
for (int i = 0; i < NUM_BLOCKS; i++) {
|
||||||
avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
|
avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
|
avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
|
||||||
printf("Average clocks/block = %Lf\n", avgElapsedClocks);
|
printf("Average clocks/block = %Lf\n", avgElapsedClocks);
|
||||||
|
|
||||||
return EXIT_SUCCESS;
|
return EXIT_SUCCESS;
|
||||||
}
|
}
|
||||||
|
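(The printed average is in raw SM clock ticks, which are device dependent. A hedged conversion to wall-clock time, assuming the device's `clockRate` property, reported in kHz, and the `dev` index from the sample:)

    // Hypothetical addition, not in the sample: ticks / kHz = milliseconds.
    cudaDeviceProp prop;
    checkCudaErrors(cudaGetDeviceProperties(&prop, dev));
    double avgElapsedMs = (double)avgElapsedClocks / (double)prop.clockRate;
    printf("Average time/block = %f ms (clock rate %d kHz)\n", avgElapsedMs, prop.clockRate);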
@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
 endif()

 # Include directories and libraries
@@ -33,7 +33,7 @@ cudaBlockSize, cudaGridSize

 ## Prerequisites

-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
@@ -34,12 +34,11 @@
 */

 // System includes
-#include <stdio.h>
-#include <stdint.h>
 #include <assert.h>

 #include <cuda_runtime.h>
 #include <nvrtc_helper.h>
+#include <stdint.h>
+#include <stdio.h>

 // helper functions and utilities to work with CUDA
 #include <helper_functions.h>
@@ -71,64 +70,68 @@

 // Start the main CUDA Sample here

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("CUDA Clock sample\n");

     typedef long clock_t;

     clock_t timer[NUM_BLOCKS * 2];

     float input[NUM_THREADS * 2];

     for (int i = 0; i < NUM_THREADS * 2; i++) {
         input[i] = (float)i;
     }

     char *cubin, *kernel_file;
     size_t cubinSize;

     kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
     compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

     CUmodule module = loadCUBIN(cubin, argc, argv);
     CUfunction kernel_addr;

     checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));

     dim3 cudaBlockSize(NUM_THREADS, 1, 1);
     dim3 cudaGridSize(NUM_BLOCKS, 1, 1);

     CUdeviceptr dinput, doutput, dtimer;
     checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
     checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
     checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
     checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));

     void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};

-    checkCudaErrors(cuLaunchKernel(
-        kernel_addr, cudaGridSize.x, cudaGridSize.y,
-        cudaGridSize.z, /* grid dim */
-        cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
-        sizeof(float) * 2 * NUM_THREADS, 0, /* shared mem, stream */
-        &arr[0], /* arguments */
-        0));
+    checkCudaErrors(cuLaunchKernel(kernel_addr,
+                                   cudaGridSize.x,
+                                   cudaGridSize.y,
+                                   cudaGridSize.z, /* grid dim */
+                                   cudaBlockSize.x,
+                                   cudaBlockSize.y,
+                                   cudaBlockSize.z, /* block dim */
+                                   sizeof(float) * 2 * NUM_THREADS,
+                                   0, /* shared mem, stream */
+                                   &arr[0], /* arguments */
+                                   0));

     checkCudaErrors(cuCtxSynchronize());
-    checkCudaErrors(
-        cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+    checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
     checkCudaErrors(cuMemFree(dinput));
     checkCudaErrors(cuMemFree(doutput));
     checkCudaErrors(cuMemFree(dtimer));

     long double avgElapsedClocks = 0;

     for (int i = 0; i < NUM_BLOCKS; i++) {
         avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
     }

     avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
     printf("Average clocks/block = %Lf\n", avgElapsedClocks);

     return EXIT_SUCCESS;
 }
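(The `arr` array above shows the driver-API calling convention: cuLaunchKernel takes an array of pointers to the arguments, not the argument values themselves. The same packing for a hypothetical kernel taking `(int n, float *p)` would look like this sketch:)

    // Illustrative only: pack pointers-to-arguments for cuLaunchKernel.
    int n = 1024;
    CUdeviceptr p; // assumed already allocated with cuMemAlloc
    void *args[] = {(void *)&n, (void *)&p};
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   1, 1, 1,   /* grid dim */
                                   256, 1, 1, /* block dim */
                                   0, 0,      /* shared mem, stream */
                                   args, 0));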
@@ -37,38 +37,41 @@
 // time it takes to do that for each block. The timing results are stored
 // in device memory.

-extern "C" __global__ void timedReduction(const float *input, float *output,
-                                          clock_t *timer) {
+extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer)
+{
     // __shared__ float shared[2 * blockDim.x];
     extern __shared__ float shared[];

     const int tid = threadIdx.x;
     const int bid = blockIdx.x;

-    if (tid == 0) timer[bid] = clock();
+    if (tid == 0)
+        timer[bid] = clock();

     // Copy input.
     shared[tid] = input[tid];
     shared[tid + blockDim.x] = input[tid + blockDim.x];

     // Perform reduction to find minimum.
     for (int d = blockDim.x; d > 0; d /= 2) {
         __syncthreads();

         if (tid < d) {
             float f0 = shared[tid];
             float f1 = shared[tid + d];

             if (f1 < f0) {
                 shared[tid] = f1;
             }
         }
     }

     // Write result.
-    if (tid == 0) output[bid] = shared[0];
+    if (tid == 0)
+        output[bid] = shared[0];

     __syncthreads();

-    if (tid == 0) timer[bid + gridDim.x] = clock();
+    if (tid == 0)
+        timer[bid + gridDim.x] = clock();
 }
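(The `extern "C"` linkage on this kernel is what lets the host code in the previous hunk resolve it by its plain name; with C++ linkage, NVRTC would emit a mangled symbol. A rough sketch of the dependency:)

    // Works only because the kernel is declared extern "C" (unmangled name).
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));
    // With C++ linkage, nvrtcAddNameExpression/nvrtcGetLoweredName would be
    // needed at compile time to recover the mangled name first.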
@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
 endif()

 # Include directories and libraries
@@ -30,7 +30,7 @@ cudaMemcpy, cudaGetErrorString, cudaFree, cudaGetLastError, cudaSetDevice, cudaG

 ## Prerequisites

-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
@@ -32,128 +32,125 @@
 #include <helper_cuda.h>
 #include <omp.h>
 #include <stdio.h> // stdio functions are used since C++ streams aren't necessarily thread safe

 using namespace std;

 // a simple kernel that simply increments each array element by b
-__global__ void kernelAddConstant(int *g_a, const int b) {
+__global__ void kernelAddConstant(int *g_a, const int b)
+{
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     g_a[idx] += b;
 }

 // a predicate that checks whether each array element is set to its index plus b
-int correctResult(int *data, const int n, const int b) {
+int correctResult(int *data, const int n, const int b)
+{
     for (int i = 0; i < n; i++)
-        if (data[i] != i + b) return 0;
+        if (data[i] != i + b)
+            return 0;

     return 1;
 }

-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     int num_gpus = 0; // number of CUDA GPUs

     printf("%s Starting...\n\n", argv[0]);

     /////////////////////////////////////////////////////////////////
     // determine the number of CUDA capable GPUs
     //
     cudaGetDeviceCount(&num_gpus);

     if (num_gpus < 1) {
         printf("no CUDA capable devices were detected\n");
         return 1;
     }

     /////////////////////////////////////////////////////////////////
     // display CPU and GPU configuration
     //
     printf("number of host CPUs:\t%d\n", omp_get_num_procs());
     printf("number of CUDA devices:\t%d\n", num_gpus);

     for (int i = 0; i < num_gpus; i++) {
         cudaDeviceProp dprop;
         cudaGetDeviceProperties(&dprop, i);
         printf(" %d: %s\n", i, dprop.name);
     }

     printf("---------------------------\n");

     /////////////////////////////////////////////////////////////////
     // initialize data
     //
     unsigned int n = num_gpus * 8192;
     unsigned int nbytes = n * sizeof(int);
     int *a = 0; // pointer to data on the CPU
     int b = 3;  // value by which the array is incremented
     a = (int *)malloc(nbytes);

     if (0 == a) {
         printf("couldn't allocate CPU memory\n");
         return 1;
     }

-    for (unsigned int i = 0; i < n; i++) a[i] = i;
+    for (unsigned int i = 0; i < n; i++)
+        a[i] = i;

     ////////////////////////////////////////////////////////////////
     // run as many CPU threads as there are CUDA devices
     // each CPU thread controls a different device, processing its
     // portion of the data. It's possible to use more CPU threads
     // than there are CUDA devices, in which case several CPU
     // threads will be allocating resources and launching kernels
     // on the same device. For example, try omp_set_num_threads(2*num_gpus);
     // Recall that all variables declared inside an "omp parallel" scope are
     // local to each CPU thread
     //
-    omp_set_num_threads(
-        num_gpus); // create as many CPU threads as there are CUDA devices
+    omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
     // omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there
     // are CUDA devices
 #pragma omp parallel
     {
         unsigned int cpu_thread_id = omp_get_thread_num();
         unsigned int num_cpu_threads = omp_get_num_threads();

         // set and check the CUDA device for this CPU thread
         int gpu_id = -1;
-        checkCudaErrors(cudaSetDevice(
-            cpu_thread_id %
-            num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
+        checkCudaErrors(
+            cudaSetDevice(cpu_thread_id % num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
         checkCudaErrors(cudaGetDevice(&gpu_id));
-        printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id,
-               num_cpu_threads, gpu_id);
+        printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);

-        int *d_a =
-            0; // pointer to memory on the device associated with this CPU thread
-        int *sub_a =
-            a +
-            cpu_thread_id * n /
-                num_cpu_threads; // pointer to this CPU thread's portion of data
+        int *d_a = 0;                                         // pointer to memory on the device associated with this CPU thread
+        int *sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data
         unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
         dim3 gpu_threads(128); // 128 threads per block
         dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));

         checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
         checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
-        checkCudaErrors(
-            cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
+        checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
         kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);

-        checkCudaErrors(
-            cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
+        checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
         checkCudaErrors(cudaFree(d_a));
     }
     printf("---------------------------\n");

     if (cudaSuccess != cudaGetLastError())
         printf("%s\n", cudaGetErrorString(cudaGetLastError()));

     ////////////////////////////////////////////////////////////////
     // check the result
     //
     bool bResult = correctResult(a, n, b);

-    if (a) free(a); // free CPU memory
+    if (a)
+        free(a); // free CPU memory

     exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }
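(The commented-out oversubscription variant mentioned in the code is a one-line change; the `cpu_thread_id % num_gpus` in cudaSetDevice is what keeps it valid. A minimal sketch, assuming the same variables as the sample:)

    // Hypothetical variant: two CPU threads per GPU. Thread pairs share a
    // device, and each thread processes a proportionally smaller slice of a[].
    omp_set_num_threads(2 * num_gpus);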
@@ -9,8 +9,10 @@ find_package(CUDAToolkit REQUIRED)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 60 61 70 72 75 80 86 87 89 90 100 101 120)
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
 endif()

 # Include directories and libraries
@@ -30,7 +30,7 @@ cudaMemcpy, cudaFree, cudaMallocHost, cudaFreeHost, cudaMalloc, cudaGetDevicePro

 ## Prerequisites

-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
@@ -25,191 +25,188 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

-#include "cuda_fp16.h"
-#include "helper_cuda.h"
-
 #include <cstdio>
 #include <cstdlib>
 #include <ctime>

+#include "cuda_fp16.h"
+#include "helper_cuda.h"
+
 #define NUM_OF_BLOCKS 128
 #define NUM_OF_THREADS 128

-__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) {
+__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v)
+{
     if (threadIdx.x < 64)
         v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
     __syncthreads();
     if (threadIdx.x < 32)
         v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 32]);
     __syncthreads();
     if (threadIdx.x < 16)
         v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 16]);
     __syncthreads();
     if (threadIdx.x < 8)
         v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 8]);
     __syncthreads();
     if (threadIdx.x < 4)
         v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 4]);
     __syncthreads();
     if (threadIdx.x < 2)
         v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 2]);
     __syncthreads();
     if (threadIdx.x < 1)
         v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 1]);
     __syncthreads();
 }

-__forceinline__ __device__ void reduceInShared_native(half2 *const v) {
-    if (threadIdx.x < 64) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
+__forceinline__ __device__ void reduceInShared_native(half2 *const v)
+{
+    if (threadIdx.x < 64)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
     __syncthreads();
-    if (threadIdx.x < 32) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
+    if (threadIdx.x < 32)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
     __syncthreads();
-    if (threadIdx.x < 16) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
+    if (threadIdx.x < 16)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
     __syncthreads();
-    if (threadIdx.x < 8) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
+    if (threadIdx.x < 8)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
     __syncthreads();
-    if (threadIdx.x < 4) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
+    if (threadIdx.x < 4)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
     __syncthreads();
-    if (threadIdx.x < 2) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
+    if (threadIdx.x < 2)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
     __syncthreads();
-    if (threadIdx.x < 1) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
+    if (threadIdx.x < 1)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
     __syncthreads();
 }

-__global__ void scalarProductKernel_intrinsics(half2 const *const a,
-                                               half2 const *const b,
-                                               float *const results,
-                                               size_t const size) {
+__global__ void
+scalarProductKernel_intrinsics(half2 const *const a, half2 const *const b, float *const results, size_t const size)
+{
     const int stride = gridDim.x * blockDim.x;
     __shared__ half2 shArray[NUM_OF_THREADS];

     shArray[threadIdx.x] = __float2half2_rn(0.f);
     half2 value = __float2half2_rn(0.f);

     for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
         value = __hfma2(a[i], b[i], value);
     }

     shArray[threadIdx.x] = value;
     __syncthreads();
     reduceInShared_intrinsics(shArray);

     if (threadIdx.x == 0) {
         half2 result = shArray[0];
         float f_result = __low2float(result) + __high2float(result);
         results[blockIdx.x] = f_result;
     }
 }

-__global__ void scalarProductKernel_native(half2 const *const a,
-                                           half2 const *const b,
-                                           float *const results,
-                                           size_t const size) {
+__global__ void
+scalarProductKernel_native(half2 const *const a, half2 const *const b, float *const results, size_t const size)
+{
     const int stride = gridDim.x * blockDim.x;
     __shared__ half2 shArray[NUM_OF_THREADS];

     half2 value(0.f, 0.f);
     shArray[threadIdx.x] = value;

     for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
         value = a[i] * b[i] + value;
     }

     shArray[threadIdx.x] = value;
     __syncthreads();
     reduceInShared_native(shArray);

     if (threadIdx.x == 0) {
         half2 result = shArray[0];
         float f_result = (float)result.y + (float)result.x;
         results[blockIdx.x] = f_result;
     }
 }

-void generateInput(half2 *a, size_t size) {
+void generateInput(half2 *a, size_t size)
+{
     for (size_t i = 0; i < size; ++i) {
         half2 temp;
         temp.x = static_cast<float>(rand() % 4);
         temp.y = static_cast<float>(rand() % 2);
         a[i] = temp;
     }
 }

-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     srand((unsigned int)time(NULL));
     size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;

     half2 *vec[2];
     half2 *devVec[2];

     float *results;
     float *devResults;

     int devID = findCudaDevice(argc, (const char **)argv);

     cudaDeviceProp devProp;
     checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));

     if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
-        printf(
-            "ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
-            "higher.\n");
+        printf("ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
+               "higher.\n");
         return EXIT_WAIVED;
     }

     for (int i = 0; i < 2; ++i) {
         checkCudaErrors(cudaMallocHost((void **)&vec[i], size * sizeof *vec[i]));
         checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
     }

-    checkCudaErrors(
-        cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
-    checkCudaErrors(
-        cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));
+    checkCudaErrors(cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
+    checkCudaErrors(cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));

     for (int i = 0; i < 2; ++i) {
         generateInput(vec[i], size);
-        checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i],
-                                   cudaMemcpyHostToDevice));
+        checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i], cudaMemcpyHostToDevice));
     }

-    scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
-        devVec[0], devVec[1], devResults, size);
+    scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);

-    checkCudaErrors(cudaMemcpy(results, devResults,
-                               NUM_OF_BLOCKS * sizeof *results,
-                               cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));

     float result_native = 0;
     for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
         result_native += results[i];
     }
     printf("Result native operators\t: %f \n", result_native);

-    scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
-        devVec[0], devVec[1], devResults, size);
+    scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);

-    checkCudaErrors(cudaMemcpy(results, devResults,
-                               NUM_OF_BLOCKS * sizeof *results,
-                               cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));

     float result_intrinsics = 0;
     for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
         result_intrinsics += results[i];
     }
     printf("Result intrinsics\t: %f \n", result_intrinsics);

-    printf("&&&& fp16ScalarProduct %s\n",
-           (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED"
-                                                               : "FAILED");
+    printf("&&&& fp16ScalarProduct %s\n", (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED" : "FAILED");

     for (int i = 0; i < 2; ++i) {
         checkCudaErrors(cudaFree(devVec[i]));
         checkCudaErrors(cudaFreeHost(vec[i]));
     }

     checkCudaErrors(cudaFree(devResults));
     checkCudaErrors(cudaFreeHost(results));

     return EXIT_SUCCESS;
 }
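(For readers new to half2 arithmetic: `__hfma2` performs two independent fused multiply-adds per call, one on each 16-bit lane, which is why the kernels accumulate dot-product partial sums two elements at a time. An illustrative float analogue of one `value = __hfma2(a[i], b[i], value)` step, ignoring half-precision rounding:)

    // Lane-by-lane float analogue of a single __hfma2 accumulation step.
    float2 hfma2_float_equivalent(float2 a, float2 b, float2 acc)
    {
        return make_float2(fmaf(a.x, b.x, acc.x), fmaf(a.y, b.y, acc.y));
    }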
@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
 endif()

 # Include directories and libraries
@@ -2,7 +2,7 @@

 ## Description

-This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance performance for matrix multiplication.
+This sample implements matrix multiplication and is exactly the same as the second example of the [Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) section of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the CUDA 4.0+ interface for cuBLAS to demonstrate high-performance matrix multiplication.

 ## Key Concepts
@@ -27,6 +27,6 @@ cudaStreamCreateWithFlags, cudaProfilerStop, cudaMalloc, cudaFree, cudaMallocHos

 ## Prerequisites

-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
@ -40,314 +40,303 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
// System includes
|
// System includes
|
||||||
#include <stdio.h>
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
// CUDA runtime
|
// CUDA runtime
|
||||||
#include <cuda_runtime.h>
|
|
||||||
#include <cuda_profiler_api.h>
|
#include <cuda_profiler_api.h>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
// Helper functions and utilities to work with CUDA
|
// Helper functions and utilities to work with CUDA
|
||||||
#include <helper_functions.h>
|
|
||||||
#include <helper_cuda.h>
|
#include <helper_cuda.h>
|
||||||
|
#include <helper_functions.h>
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Matrix multiplication (CUDA Kernel) on the device: C = A * B
|
* Matrix multiplication (CUDA Kernel) on the device: C = A * B
|
||||||
* wA is A's width and wB is B's width
|
* wA is A's width and wB is B's width
|
||||||
*/
|
*/
|
||||||
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,
|
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
|
||||||
float *B, int wA,
|
{
|
||||||
int wB) {
|
// Block index
|
||||||
// Block index
|
int bx = blockIdx.x;
|
||||||
int bx = blockIdx.x;
|
int by = blockIdx.y;
|
||||||
int by = blockIdx.y;
|
|
||||||
|
|
||||||
// Thread index
|
// Thread index
|
||||||
int tx = threadIdx.x;
|
int tx = threadIdx.x;
|
||||||
int ty = threadIdx.y;
|
int ty = threadIdx.y;
|
||||||
|
|
||||||
// Index of the first sub-matrix of A processed by the block
|
// Index of the first sub-matrix of A processed by the block
|
||||||
int aBegin = wA * BLOCK_SIZE * by;
|
int aBegin = wA * BLOCK_SIZE * by;
|
||||||
|
|
||||||
// Index of the last sub-matrix of A processed by the block
|
// Index of the last sub-matrix of A processed by the block
|
||||||
int aEnd = aBegin + wA - 1;
|
int aEnd = aBegin + wA - 1;
|
||||||
|
|
||||||
// Step size used to iterate through the sub-matrices of A
|
// Step size used to iterate through the sub-matrices of A
|
||||||
int aStep = BLOCK_SIZE;
|
int aStep = BLOCK_SIZE;
|
||||||
|
|
||||||
// Index of the first sub-matrix of B processed by the block
|
// Index of the first sub-matrix of B processed by the block
|
||||||
int bBegin = BLOCK_SIZE * bx;
|
int bBegin = BLOCK_SIZE * bx;
|
||||||
|
|
||||||
// Step size used to iterate through the sub-matrices of B
|
// Step size used to iterate through the sub-matrices of B
|
||||||
int bStep = BLOCK_SIZE * wB;
|
int bStep = BLOCK_SIZE * wB;
|
||||||
|
|
||||||
// Csub is used to store the element of the block sub-matrix
|
// Csub is used to store the element of the block sub-matrix
|
||||||
// that is computed by the thread
|
// that is computed by the thread
|
||||||
float Csub = 0;
|
float Csub = 0;
|
||||||
|
|
||||||
// Loop over all the sub-matrices of A and B
|
// Loop over all the sub-matrices of A and B
|
||||||
// required to compute the block sub-matrix
|
// required to compute the block sub-matrix
|
||||||
for (int a = aBegin, b = bBegin;
|
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
|
||||||
a <= aEnd;
|
// Declaration of the shared memory array As used to
|
||||||
a += aStep, b += bStep) {
|
// store the sub-matrix of A
|
||||||
// Declaration of the shared memory array As used to
|
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
|
||||||
// store the sub-matrix of A
|
|
||||||
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
|
|
||||||
|
|
||||||
// Declaration of the shared memory array Bs used to
|
// Declaration of the shared memory array Bs used to
|
||||||
// store the sub-matrix of B
|
// store the sub-matrix of B
|
||||||
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
|
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
|
||||||
|
|
||||||
// Load the matrices from device memory
|
// Load the matrices from device memory
|
||||||
// to shared memory; each thread loads
|
// to shared memory; each thread loads
|
||||||
// one element of each matrix
|
// one element of each matrix
|
||||||
As[ty][tx] = A[a + wA * ty + tx];
|
As[ty][tx] = A[a + wA * ty + tx];
|
||||||
Bs[ty][tx] = B[b + wB * ty + tx];
|
Bs[ty][tx] = B[b + wB * ty + tx];
|
||||||
|
|
||||||
// Synchronize to make sure the matrices are loaded
|
// Synchronize to make sure the matrices are loaded
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
// Multiply the two matrices together;
|
// Multiply the two matrices together;
|
||||||
// each thread computes one element
|
// each thread computes one element
|
||||||
// of the block sub-matrix
|
// of the block sub-matrix
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
|
|
||||||
for (int k = 0; k < BLOCK_SIZE; ++k) {
|
for (int k = 0; k < BLOCK_SIZE; ++k) {
|
||||||
Csub += As[ty][k] * Bs[k][tx];
|
Csub += As[ty][k] * Bs[k][tx];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Synchronize to make sure that the preceding
|
||||||
|
// computation is done before loading two new
|
||||||
|
// sub-matrices of A and B in the next iteration
|
||||||
|
__syncthreads();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Synchronize to make sure that the preceding
|
// Write the block sub-matrix to device memory;
|
||||||
// computation is done before loading two new
|
// each thread writes one element
|
||||||
// sub-matrices of A and B in the next iteration
|
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
|
||||||
__syncthreads();
|
C[c + wB * ty + tx] = Csub;
|
||||||
}
|
|
||||||
|
|
||||||
// Write the block sub-matrix to device memory;
|
|
||||||
// each thread writes one element
|
|
||||||
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
|
|
||||||
C[c + wB * ty + tx] = Csub;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ConstantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

/**
 * Run a simple test of matrix multiplication using CUDA
 */
int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB)
{
    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A;
    checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B;
    checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
    cudaStream_t stream;

    // Initialize host memory
    const float valB = 0.01f;
    ConstantInit(h_A, size_A, 1.0f);
    ConstantInit(h_B, size_B, valB);

    // Allocate device memory
    float *d_A, *d_B, *d_C;

    // Allocate host matrix C
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float *h_C;
    checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));

    if (h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    // copy host memory to device
    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));

    // Setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");

    // Performs warmup operation using matrixMul CUDA kernel
    if (block_size == 16) {
        MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    else {
        MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }

    printf("done\n");
    checkCudaErrors(cudaStreamSynchronize(stream));

    // Record the start event
    checkCudaErrors(cudaEventRecord(start, stream));

    // Execute the kernel
    int nIter = 300;

    for (int j = 0; j < nIter; j++) {
        if (block_size == 16) {
            MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
        else {
            MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    }

    // Record the stop event
    checkCudaErrors(cudaEventRecord(stop, stream));

    // Wait for the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));

    float msecTotal = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

    // Compute and print the performance
    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul =
        2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
           " WorkgroupSize= %u threads/block\n",
           gigaFlops,
           msecPerMatrixMul,
           flopsPerMatrixMul,
           threads.x * threads.y);

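For the default problem size (dimsA = 320x320, dimsB = 640x320), flopsPerMatrixMul = 2 * 320 * 320 * 640 = 131,072,000, i.e. about 0.131 GFlop per multiplication, so a run averaging 1 ms per iteration would report roughly 131 GFlop/s (illustrative arithmetic, not a measured figure). The factor of 2 counts the multiply and the add of each multiply-add separately.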
    // Copy result from device to host
    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // test relative error by the formula
    // |<x, y>_cpu - <x,y>_gpu| / <|x|, |y|> < eps
    double eps = 1.e-6; // machine zero

    for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
        double abs_err    = fabs(h_C[i] - (dimsA.x * valB));
        double dot_length = dimsA.x;
        double abs_val    = fabs(h_C[i]);
        double rel_err    = abs_err / abs_val / dot_length;

        if (rel_err > eps) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
            correct = false;
        }
    }

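A note on the tolerance: with h_A filled with 1.0f and h_B with valB, every element of C is exactly dimsA.x * valB in exact arithmetic, so any deviation is float rounding accumulated over a dot product of length dimsA.x. Dividing the absolute error by both the result's magnitude and the dot-product length turns eps into a per-accumulation relative bound, which is why the fixed 1e-6 threshold works regardless of the matrix width.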
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // Clean up memory
    checkCudaErrors(cudaFreeHost(h_A));
    checkCudaErrors(cudaFreeHost(h_B));
    checkCudaErrors(cudaFreeHost(h_C));
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    printf("\nNOTE: The CUDA Samples are not meant for performance "
           "measurements. Results may vary when GPU Boost is enabled.\n");

    if (correct) {
        return EXIT_SUCCESS;
    }
    else {
        return EXIT_FAILURE;
    }
}

/**
 * Program main
 */
int main(int argc, char **argv)
{
    printf("[Matrix Multiply Using CUDA] - Starting...\n");

    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
        printf("  Note: Outer matrix dimensions of A & B matrices"
               " must be equal.\n");

        exit(EXIT_SUCCESS);
    }

    // This will pick the best possible CUDA capable device, otherwise
    // override the device ID based on input provided at the command line
    int dev = findCudaDevice(argc, (const char **)argv);

    int block_size = 32;

    dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
    dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);

    // width of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
    }

    // height of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
    }

    // width of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
    }

    // height of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
    }

    if (dimsA.x != dimsB.y) {
        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }

    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

    checkCudaErrors(cudaProfilerStart());
    int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
    checkCudaErrors(cudaProfilerStop());

    exit(matrix_result);
}
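A typical invocation, consistent with the help text above (hypothetical values): ./matrixMul -wA=640 -hA=320 -wB=320 -hB=640. Dimensions should stay multiples of block_size (32): the grid is computed by integer division, so any remainder rows and columns of C would simply never be computed.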
@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()

# Include directories and libraries
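In practice this makes device debugging opt-in instead of being tied to the CMake build type: configuring with -DENABLE_CUDA_DEBUG=ON adds nvcc's -G flag (device debug info, usually at a large performance cost), while every other build now gets -lineinfo so profilers and sanitizers can still map device code back to source lines.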
@@ -38,6 +40,12 @@ target_link_libraries(matrixMulDrv PUBLIC
set(CUDA_FATBIN_FILE "${CMAKE_CURRENT_BINARY_DIR}/matrixMul_kernel64.fatbin")
set(CUDA_KERNEL_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/matrixMul_kernel.cu")

# Construct GENCODE_FLAGS explicitly from CUDA architectures
set(GENCODE_FLAGS "")
foreach(arch ${CMAKE_CUDA_ARCHITECTURES})
    list(APPEND GENCODE_FLAGS "-gencode=arch=compute_${arch},code=sm_${arch}")
endforeach()

add_custom_command(
    OUTPUT ${CUDA_FATBIN_FILE}
    COMMAND ${CMAKE_CUDA_COMPILER} ${INCLUDES} ${ALL_CCFLAGS} -Wno-deprecated-gpu-targets ${GENCODE_FLAGS} -o ${CUDA_FATBIN_FILE} -fatbin ${CUDA_KERNEL_SOURCE}
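For each listed architecture the loop appends one flag of the form -gencode=arch=compute_90,code=sm_90 (shown here for 90), so the resulting .fatbin embeds native SASS for every entry in CMAKE_CUDA_ARCHITECTURES rather than relying on whatever code generation the custom command would otherwise default to.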
@@ -27,6 +27,6 @@ cuMemcpyDtoH, cuLaunchKernel, cuMemcpyHtoD, cuDeviceGetName, cuDeviceTotalMem, c

## Prerequisites

Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## References (for more details)
@@ -30,11 +30,11 @@

// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA               // Matrix B height
#define WC WB               // Matrix C width
#define HC HA               // Matrix C height

#endif // _MATRIXMUL_H_
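Worked out for block_size = 32, these macros give WA = WB = WC = 128 and HA = HC = 192, HB = 128; that is, A is 192x128, B is 128x128, and C is 192x128, with every dimension a multiple of the tile width as the comment requires. (Illustrative values; block_size is chosen at runtime in initCUDA and can fall back to 16 or 8.)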
@@ -46,23 +46,23 @@

// includes, system
#include <builtin_types.h>

#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project, CUDA
#include <cstring>
#include <cuda.h>
#include <helper_cuda_drvapi.h>
#include <helper_image.h>
#include <helper_string.h>
#include <helper_timer.h>

#include <iostream>
#include <string>

#include "matrixMul.h"
@@ -71,11 +71,9 @@
void runTest(int argc, char **argv);
void randomInit(float *, int);

extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);

static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size);

#ifndef FATBIN_FILE
#define FATBIN_FILE "matrixMul_kernel64.fatbin"
@@ -84,237 +82,252 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
////////////////////////////////////////////////////////////////////////////////
// Globals
////////////////////////////////////////////////////////////////////////////////
CUdevice  cuDevice;
CUcontext cuContext;
CUmodule  cuModule;
size_t    totalGlobalMem;

const char *sSDKsample = "matrixMulDrv (Driver API)";

void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("[ %s ]\n", sSDKsample);

    runTest(argc, argv);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    // initialize CUDA
    CUfunction matrixMul = NULL;
    int        block_size = 0;

    initCUDA(argc, argv, &matrixMul, &block_size);

    // set seed for rand()
    srand(2006);

    // allocate host memory for matrices A and B
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = reinterpret_cast<float *>(malloc(mem_size_B));

    // initialize host memory
    const float valB = 0.01f;
    constantInit(h_A, size_A, 1.0f);
    constantInit(h_B, size_B, valB);

    // allocate device memory
    CUdeviceptr d_A;
    checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
    CUdeviceptr d_B;
    checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));

    // copy host memory to device
    checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
    checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));

    // allocate device memory for result
    size_t size_C = WC * HC;
    size_t mem_size_C = sizeof(float) * size_C;

    CUdeviceptr d_C;
    checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));

    // allocate mem for the result on host side
    float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));

    // create and start timer
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);

    // start the timer
    sdkStartTimer(&timer);

    // There are two ways to launch CUDA kernels via the Driver API.
    // In this CUDA Sample, we illustrate both ways to pass parameters
    // and specify parameters. By default we use the simpler method.
    dim3 block(block_size, block_size, 1);
    dim3 grid(WC / block_size, HC / block_size, 1);

    if (1) {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (simplier method)
        size_t Matrix_Width_A = (size_t)WA;
        size_t Matrix_Width_B = (size_t)WB;
        void  *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
        // new CUDA 4.0 Driver API Kernel launch call
        checkCudaErrors(cuLaunchKernel(matrixMul,
                                       grid.x,
                                       grid.y,
                                       grid.z,
                                       block.x,
                                       block.y,
                                       block.z,
                                       2 * block_size * block_size * sizeof(float),
                                       NULL,
                                       args,
                                       NULL));
    }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (advanced method)
        int  offset = 0;
        char argBuffer[256];

        // pass in launch parameters (not actually de-referencing CUdeviceptr).
        // CUdeviceptr is storing the value of the parameters
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
        offset += sizeof(d_C);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
        offset += sizeof(d_A);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
        offset += sizeof(d_B);

        size_t Matrix_Width_A = (size_t)WA;
        size_t Matrix_Width_B = (size_t)WB;

        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
        offset += sizeof(Matrix_Width_A);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
        offset += sizeof(Matrix_Width_B);

        void *kernel_launch_config[5] = {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};

        // new CUDA 4.0 Driver API Kernel launch call
        checkCudaErrors(cuLaunchKernel(matrixMul,
                                       grid.x,
                                       grid.y,
                                       grid.z,
                                       block.x,
                                       block.y,
                                       block.z,
                                       2 * block_size * block_size * sizeof(float),
                                       NULL,
                                       NULL,
                                       reinterpret_cast<void **>(&kernel_launch_config)));
    }

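One caveat worth spelling out in the advanced path above: every argument is written into argBuffer as a CUdeviceptr-sized slot. That happens to be correct here because on a 64-bit platform CUdeviceptr and size_t are both 8 bytes, but in general each kernel parameter must be stored at its own natural alignment with its own size, or cuLaunchKernel will read garbage. The CU_LAUNCH_PARAM_BUFFER_SIZE entry passes the final offset so the driver knows how many bytes of the buffer to consume.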
    // copy result from device to host
    checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));

    // stop and destroy timer
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    printf("Checking computed result for correctness: ");
    bool correct = true;

    for (int i = 0; i < static_cast<int>(WC * HC); i++) {
        if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * valB);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
           "Results may vary when GPU Boost is enabled.\n");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    checkCudaErrors(cuMemFree(d_A));
    checkCudaErrors(cuMemFree(d_B));
    checkCudaErrors(cuMemFree(d_C));
    checkCudaErrors(cuCtxDestroy(cuContext));
}

// Allocates a matrix with random float entries.
void randomInit(float *data, int size)
{
    for (int i = 0; i < size; ++i) {
        data[i] = rand() / static_cast<float>(RAND_MAX);
    }
}

static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size)
{
    CUfunction cuFunction = 0;
    int        major = 0, minor = 0;
    char       deviceName[100];

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

    // get compute capabilities and the devicename
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
    printf("  Total amount of global memory: %llu bytes\n", (long long unsigned int)totalGlobalMem);

    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    std::string        module_path;
    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // select the suitable kernel function
    const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit", "matrixMul_bs8_64bit"};

    int idx        = 0;
    int block_size = 32;
    while (idx < 3) {
        int threadsPerBlock = 0;
        int blocksPerGrid   = 0;

        checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
        checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
            &blocksPerGrid, &threadsPerBlock, cuFunction, 0, 2 * block_size * block_size * sizeof(float), 0));
        if (block_size * block_size <= threadsPerBlock) {
            printf("> %d block size selected\n", block_size);
            break;
        }
        else {
            block_size /= 2;
        }
        idx++;
    }

    *pMatrixMul = cuFunction;
    *blk_size   = block_size;

    return 0;
}
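The selection loop leans on cuOccupancyMaxPotentialBlockSize: for each candidate kernel it asks the driver for the block size that maximizes potential occupancy given the dynamic shared-memory requirement of 2 * block_size * block_size * sizeof(float), i.e. 8 KiB for the 32x32 tile, which also needs 1024 threads per block. If block_size * block_size threads do not fit within the returned limit, the code halves the tile (32 to 16 to 8) and tries the matching precompiled kernel.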
@@ -42,86 +42,87 @@
//! wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
template <int block_size, typename size_type>
__device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB)
{
    // Block index
    size_type bx = blockIdx.x;
    size_type by = blockIdx.y;

    // Thread index
    size_type tx = threadIdx.x;
    size_type ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    size_type aBegin = wA * block_size * by;

    // Index of the last sub-matrix of A processed by the block
    size_type aEnd = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    size_type aStep = block_size;

    // Index of the first sub-matrix of B processed by the block
    size_type bBegin = block_size * bx;

    // Step size used to iterate through the sub-matrices of B
    size_type bStep = block_size * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[block_size][block_size];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[block_size][block_size];

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll

        for (size_type k = 0; k < block_size; ++k)
            Csub += AS(ty, k) * BS(k, tx);

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    size_type c = wB * block_size * by + block_size * bx;
    C[c + wB * ty + tx] = Csub;
}

// C wrappers around our template kernel
extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<8, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<16, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<32, size_t>(C, A, B, wA, wB);
}

#endif // #ifndef _MATRIXMUL_KERNEL_H_
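The extern "C" wrappers at the bottom are what make the Driver API flow work: initCUDA looks kernels up by name with cuModuleGetFunction, and a template instantiation like matrixMul<32, size_t> has a compiler-dependent mangled name, so each block size is pinned to a stable, unmangled symbol (matrixMul_bs8_64bit, matrixMul_bs16_64bit, matrixMul_bs32_64bit) that the host code can rely on.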
@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")        # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()

# Include directories and libraries
@@ -27,6 +27,6 @@ cuMemcpyDtoH, cuDeviceGetName, cuParamSeti, cuModuleLoadDataEx, cuModuleGetFunct

## Prerequisites

Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## References (for more details)
@@ -15,210 +15,211 @@

// With these flags defined, this source file will dynamically
// load the corresponding functions. Disabled by default.
// #define CUDA_INIT_D3D9
// #define CUDA_INIT_D3D10
// #define CUDA_INIT_D3D11
// #define CUDA_INIT_OPENGL

#include "cuda_drvapi_dynlink.h"

#include <stdio.h>

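Each of the pointers below is resolved out of the driver library at runtime rather than linked against directly. A minimal sketch of that resolution pattern, assuming a POSIX system and abbreviated error handling (the real helper also covers Windows via LoadLibrary; only the symbol name "cuInit" below is taken from the file itself):

// Hypothetical minimal version of the dynamic-load pattern used by this file.
#include <dlfcn.h>
#include <cstdio>

typedef int (*tcuInitSketch)(unsigned int); // shape of CUresult cuInit(unsigned int)

int main()
{
    void *lib = dlopen("libcuda.so.1", RTLD_NOW); // the driver library, not the toolkit runtime
    if (lib == NULL) {
        std::fprintf(stderr, "CUDA driver library not found\n");
        return 1;
    }
    tcuInitSketch init = reinterpret_cast<tcuInitSketch>(dlsym(lib, "cuInit"));
    return (init != NULL && init(0) == 0) ? 0 : 1; // CUDA_SUCCESS == 0
}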
tcuInit                    *_cuInit;
tcuDriverGetVersion        *cuDriverGetVersion;
tcuDeviceGet               *cuDeviceGet;
tcuDeviceGetCount          *cuDeviceGetCount;
tcuDeviceGetName           *cuDeviceGetName;
tcuDeviceComputeCapability *cuDeviceComputeCapability;
tcuDeviceTotalMem          *cuDeviceTotalMem;
tcuDeviceGetProperties     *cuDeviceGetProperties;
tcuDeviceGetAttribute      *cuDeviceGetAttribute;
tcuGetErrorString          *cuGetErrorString;
tcuCtxCreate               *cuCtxCreate;
tcuCtxDestroy              *cuCtxDestroy;
tcuCtxAttach               *cuCtxAttach;
tcuCtxDetach               *cuCtxDetach;
tcuCtxPushCurrent          *cuCtxPushCurrent;
tcuCtxPopCurrent           *cuCtxPopCurrent;
tcuCtxGetCurrent           *cuCtxGetCurrent;
tcuCtxSetCurrent           *cuCtxSetCurrent;
tcuCtxGetDevice            *cuCtxGetDevice;
tcuCtxSynchronize          *cuCtxSynchronize;
tcuModuleLoad              *cuModuleLoad;
tcuModuleLoadData          *cuModuleLoadData;
tcuModuleLoadDataEx        *cuModuleLoadDataEx;
tcuModuleLoadFatBinary     *cuModuleLoadFatBinary;
tcuModuleUnload            *cuModuleUnload;
tcuModuleGetFunction       *cuModuleGetFunction;
tcuModuleGetGlobal         *cuModuleGetGlobal;
tcuModuleGetTexRef         *cuModuleGetTexRef;
tcuModuleGetSurfRef        *cuModuleGetSurfRef;
tcuMemGetInfo              *cuMemGetInfo;
tcuMemAlloc                *cuMemAlloc;
tcuMemAllocPitch           *cuMemAllocPitch;
tcuMemFree                 *cuMemFree;
tcuMemGetAddressRange      *cuMemGetAddressRange;
tcuMemAllocHost            *cuMemAllocHost;
tcuMemFreeHost             *cuMemFreeHost;
tcuMemHostAlloc            *cuMemHostAlloc;
tcuMemHostGetFlags         *cuMemHostGetFlags;

tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
tcuDeviceGetByPCIBusId     *cuDeviceGetByPCIBusId;
tcuDeviceGetPCIBusId       *cuDeviceGetPCIBusId;
tcuIpcGetEventHandle       *cuIpcGetEventHandle;
tcuIpcOpenEventHandle      *cuIpcOpenEventHandle;
tcuIpcGetMemHandle         *cuIpcGetMemHandle;
tcuIpcOpenMemHandle        *cuIpcOpenMemHandle;
tcuIpcCloseMemHandle       *cuIpcCloseMemHandle;

tcuMemHostRegister                   *cuMemHostRegister;
tcuMemHostUnregister                 *cuMemHostUnregister;
tcuMemcpyHtoD                        *cuMemcpyHtoD;
tcuMemcpyDtoH                        *cuMemcpyDtoH;
tcuMemcpyDtoD                        *cuMemcpyDtoD;
tcuMemcpyDtoA                        *cuMemcpyDtoA;
tcuMemcpyAtoD                        *cuMemcpyAtoD;
tcuMemcpyHtoA                        *cuMemcpyHtoA;
tcuMemcpyAtoH                        *cuMemcpyAtoH;
tcuMemcpyAtoA                        *cuMemcpyAtoA;
tcuMemcpy2D                          *cuMemcpy2D;
tcuMemcpy2DUnaligned                 *cuMemcpy2DUnaligned;
tcuMemcpy3D                          *cuMemcpy3D;
tcuMemcpyHtoDAsync                   *cuMemcpyHtoDAsync;
tcuMemcpyDtoHAsync                   *cuMemcpyDtoHAsync;
tcuMemcpyDtoDAsync                   *cuMemcpyDtoDAsync;
tcuMemcpyHtoAAsync                   *cuMemcpyHtoAAsync;
tcuMemcpyAtoHAsync                   *cuMemcpyAtoHAsync;
tcuMemcpy2DAsync                     *cuMemcpy2DAsync;
tcuMemcpy3DAsync                     *cuMemcpy3DAsync;
tcuMemcpy                            *cuMemcpy;
tcuMemcpyPeer                        *cuMemcpyPeer;
tcuMemsetD8                          *cuMemsetD8;
tcuMemsetD16                         *cuMemsetD16;
tcuMemsetD32                         *cuMemsetD32;
tcuMemsetD2D8                        *cuMemsetD2D8;
tcuMemsetD2D16                       *cuMemsetD2D16;
tcuMemsetD2D32                       *cuMemsetD2D32;
tcuFuncSetBlockShape                 *cuFuncSetBlockShape;
tcuFuncSetSharedSize                 *cuFuncSetSharedSize;
tcuFuncGetAttribute                  *cuFuncGetAttribute;
tcuFuncSetCacheConfig                *cuFuncSetCacheConfig;
tcuFuncSetSharedMemConfig            *cuFuncSetSharedMemConfig;
tcuLaunchKernel                      *cuLaunchKernel;
tcuArrayCreate                       *cuArrayCreate;
tcuArrayGetDescriptor                *cuArrayGetDescriptor;
tcuArrayDestroy                      *cuArrayDestroy;
tcuArray3DCreate                     *cuArray3DCreate;
tcuArray3DGetDescriptor              *cuArray3DGetDescriptor;
tcuTexRefCreate                      *cuTexRefCreate;
tcuTexRefDestroy                     *cuTexRefDestroy;
tcuTexRefSetArray                    *cuTexRefSetArray;
tcuTexRefSetAddress                  *cuTexRefSetAddress;
tcuTexRefSetAddress2D                *cuTexRefSetAddress2D;
tcuTexRefSetFormat                   *cuTexRefSetFormat;
tcuTexRefSetAddressMode              *cuTexRefSetAddressMode;
tcuTexRefSetFilterMode               *cuTexRefSetFilterMode;
tcuTexRefSetFlags                    *cuTexRefSetFlags;
tcuTexRefGetAddress                  *cuTexRefGetAddress;
tcuTexRefGetArray                    *cuTexRefGetArray;
tcuTexRefGetAddressMode              *cuTexRefGetAddressMode;
tcuTexRefGetFilterMode               *cuTexRefGetFilterMode;
tcuTexRefGetFormat                   *cuTexRefGetFormat;
tcuTexRefGetFlags                    *cuTexRefGetFlags;
tcuSurfRefSetArray                   *cuSurfRefSetArray;
tcuSurfRefGetArray                   *cuSurfRefGetArray;
tcuParamSetSize                      *cuParamSetSize;
tcuParamSeti                         *cuParamSeti;
tcuParamSetf                         *cuParamSetf;
tcuParamSetv                         *cuParamSetv;
tcuParamSetTexRef                    *cuParamSetTexRef;
tcuLaunch                            *cuLaunch;
tcuLaunchGrid                        *cuLaunchGrid;
tcuLaunchGridAsync                   *cuLaunchGridAsync;
tcuEventCreate                       *cuEventCreate;
tcuEventRecord                       *cuEventRecord;
tcuEventQuery                        *cuEventQuery;
tcuEventSynchronize                  *cuEventSynchronize;
tcuEventDestroy                      *cuEventDestroy;
tcuEventElapsedTime                  *cuEventElapsedTime;
tcuStreamCreate                      *cuStreamCreate;
tcuStreamWaitEvent                   *cuStreamWaitEvent;
tcuStreamAddCallback                 *cuStreamAddCallback;
tcuStreamQuery                       *cuStreamQuery;
tcuStreamSynchronize                 *cuStreamSynchronize;
tcuStreamDestroy                     *cuStreamDestroy;
tcuGraphicsUnregisterResource        *cuGraphicsUnregisterResource;
tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
tcuGraphicsResourceGetMappedPointer  *cuGraphicsResourceGetMappedPointer;
tcuGraphicsResourceSetMapFlags       *cuGraphicsResourceSetMapFlags;
tcuGraphicsMapResources              *cuGraphicsMapResources;
tcuGraphicsUnmapResources            *cuGraphicsUnmapResources;
tcuGetExportTable                    *cuGetExportTable;
tcuCtxSetLimit                       *cuCtxSetLimit;
tcuCtxGetLimit                       *cuCtxGetLimit;
tcuCtxGetCacheConfig                 *cuCtxGetCacheConfig;
tcuCtxSetCacheConfig                 *cuCtxSetCacheConfig;
tcuCtxGetSharedMemConfig             *cuCtxGetSharedMemConfig;
tcuCtxSetSharedMemConfig             *cuCtxSetSharedMemConfig;
tcuCtxGetApiVersion                  *cuCtxGetApiVersion;

tcuMipmappedArrayCreate              *cuMipmappedArrayCreate;
tcuMipmappedArrayGetLevel            *cuMipmappedArrayGetLevel;
tcuMipmappedArrayDestroy             *cuMipmappedArrayDestroy;
|
||||||
|
tcuStreamAddCallback *cuStreamAddCallback;
|
||||||
|
tcuStreamQuery *cuStreamQuery;
|
||||||
|
tcuStreamSynchronize *cuStreamSynchronize;
|
||||||
|
tcuStreamDestroy *cuStreamDestroy;
|
||||||
|
tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
|
||||||
|
tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
|
||||||
|
tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
|
||||||
|
tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
|
||||||
|
tcuGraphicsMapResources *cuGraphicsMapResources;
|
||||||
|
tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
|
||||||
|
tcuGetExportTable *cuGetExportTable;
|
||||||
|
tcuCtxSetLimit *cuCtxSetLimit;
|
||||||
|
tcuCtxGetLimit *cuCtxGetLimit;
|
||||||
|
tcuCtxGetCacheConfig *cuCtxGetCacheConfig;
|
||||||
|
tcuCtxSetCacheConfig *cuCtxSetCacheConfig;
|
||||||
|
tcuCtxGetSharedMemConfig *cuCtxGetSharedMemConfig;
|
||||||
|
tcuCtxSetSharedMemConfig *cuCtxSetSharedMemConfig;
|
||||||
|
tcuCtxGetApiVersion *cuCtxGetApiVersion;
|
||||||
|
|
||||||
tcuProfilerStop *cuProfilerStop;
|
tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
|
||||||
|
tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
|
||||||
|
tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;
|
||||||
|
|
||||||
|
tcuProfilerStop *cuProfilerStop;
|
||||||
|
|
||||||
#ifdef CUDA_INIT_D3D9
// D3D9/CUDA interop (CUDA 1.x compatible API). These functions
// are deprecated; please use the ones below
tcuD3D9Begin *cuD3D9Begin;
tcuD3D9End *cuD3DEnd;
tcuD3D9RegisterVertexBuffer *cuD3D9RegisterVertexBuffer;
tcuD3D9MapVertexBuffer *cuD3D9MapVertexBuffer;
tcuD3D9UnmapVertexBuffer *cuD3D9UnmapVertexBuffer;
tcuD3D9UnregisterVertexBuffer *cuD3D9UnregisterVertexBuffer;

// D3D9/CUDA interop (CUDA 2.x compatible)
tcuD3D9GetDirect3DDevice *cuD3D9GetDirect3DDevice;
tcuD3D9RegisterResource *cuD3D9RegisterResource;
tcuD3D9UnregisterResource *cuD3D9UnregisterResource;
tcuD3D9MapResources *cuD3D9MapResources;
tcuD3D9UnmapResources *cuD3D9UnmapResources;
tcuD3D9ResourceSetMapFlags *cuD3D9ResourceSetMapFlags;
tcuD3D9ResourceGetSurfaceDimensions *cuD3D9ResourceGetSurfaceDimensions;
tcuD3D9ResourceGetMappedArray *cuD3D9ResourceGetMappedArray;
tcuD3D9ResourceGetMappedPointer *cuD3D9ResourceGetMappedPointer;
tcuD3D9ResourceGetMappedSize *cuD3D9ResourceGetMappedSize;
tcuD3D9ResourceGetMappedPitch *cuD3D9ResourceGetMappedPitch;

// D3D9/CUDA interop (CUDA 2.0+)
tcuD3D9GetDevice *cuD3D9GetDevice;
tcuD3D9CtxCreate *cuD3D9CtxCreate;
tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource;
#endif

#ifdef CUDA_INIT_D3D10
// D3D10/CUDA interop (CUDA 3.0+)
tcuD3D10GetDevice *cuD3D10GetDevice;
tcuD3D10CtxCreate *cuD3D10CtxCreate;
tcuGraphicsD3D10RegisterResource *cuGraphicsD3D10RegisterResource;
#endif

#ifdef CUDA_INIT_D3D11
// D3D11/CUDA interop (CUDA 3.0+)
tcuD3D11GetDevice *cuD3D11GetDevice;
tcuD3D11CtxCreate *cuD3D11CtxCreate;
tcuGraphicsD3D11RegisterResource *cuGraphicsD3D11RegisterResource;
#endif

// GL/CUDA interop
#ifdef CUDA_INIT_OPENGL
tcuGLCtxCreate *cuGLCtxCreate;
tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
tcuWGLGetDevice *cuWGLGetDevice;
#endif
#endif
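All of the declarations above follow one pattern: a tcuXxx typedef (defined earlier in this header) names the signature of a driver entry point, and the matching pointer is filled in at runtime by the cuInit() wrapper shown in the hunks below. As an illustrative sketch (the typedef text here is an assumption, mirroring the driver API signature, not a quote from the header):

    typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
    tcuStreamSynchronize *cuStreamSynchronize; // bound via GetProcAddress/dlsym, then called like the real API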
@@ -239,8 +240,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
 {
     *pInstance = LoadLibrary(__CudaLibName);

-    if (*pInstance == NULL)
-    {
+    if (*pInstance == NULL) {
         printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
         return CUDA_ERROR_UNKNOWN;
     }
@@ -248,38 +248,35 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
     return CUDA_SUCCESS;
 }

 #define GET_PROC_EX(name, alias, required) \
     alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \
     if (alias == NULL && required) { \
-        printf("Failed to find required function \"%s\" in %s\n", \
-               #name, __CudaLibName); \
+        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
         return CUDA_ERROR_UNKNOWN; \
     }

 #define GET_PROC_EX_V2(name, alias, required) \
-    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));\
+    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2)); \
     if (alias == NULL && required) { \
-        printf("Failed to find required function \"%s\" in %s\n", \
-               STRINGIFY(name##_v2), __CudaLibName); \
+        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
         return CUDA_ERROR_UNKNOWN; \
     }

 #define GET_PROC_EX_V3(name, alias, required) \
-    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));\
+    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3)); \
     if (alias == NULL && required) { \
-        printf("Failed to find required function \"%s\" in %s\n", \
-               STRINGIFY(name##_v3), __CudaLibName); \
+        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
         return CUDA_ERROR_UNKNOWN; \
     }

-#elif defined(__unix__) || defined (__QNX__) || defined(__APPLE__) || defined(__MACOSX)
+#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || defined(__MACOSX)

 #include <dlfcn.h>

 #if defined(__APPLE__) || defined(__MACOSX)
 static char __CudaLibName[] = "/usr/local/cuda/lib/libcuda.dylib";
 #elif defined(__ANDROID__)
-#if defined (__aarch64__)
+#if defined(__aarch64__)
 static char __CudaLibName[] = "/system/vendor/lib64/libcuda.so";
 #elif defined(__arm__)
 static char __CudaLibName[] = "/system/vendor/lib/libcuda.so";
@@ -294,8 +291,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
 {
     *pInstance = dlopen(__CudaLibName, RTLD_NOW);

-    if (*pInstance == NULL)
-    {
+    if (*pInstance == NULL) {
         printf("dlopen \"%s\" failed!\n", __CudaLibName);
         return CUDA_ERROR_UNKNOWN;
     }
@@ -303,52 +299,49 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
     return CUDA_SUCCESS;
 }

 #define GET_PROC_EX(name, alias, required) \
     alias = (t##name *)dlsym(CudaDrvLib, #name); \
     if (alias == NULL && required) { \
-        printf("Failed to find required function \"%s\" in %s\n", \
-               #name, __CudaLibName); \
+        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
         return CUDA_ERROR_UNKNOWN; \
     }

 #define GET_PROC_EX_V2(name, alias, required) \
     alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2)); \
     if (alias == NULL && required) { \
-        printf("Failed to find required function \"%s\" in %s\n", \
-               STRINGIFY(name##_v2), __CudaLibName); \
+        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
         return CUDA_ERROR_UNKNOWN; \
     }

 #define GET_PROC_EX_V3(name, alias, required) \
     alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3)); \
     if (alias == NULL && required) { \
-        printf("Failed to find required function \"%s\" in %s\n", \
-               STRINGIFY(name##_v3), __CudaLibName); \
+        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
         return CUDA_ERROR_UNKNOWN; \
     }

 #else
 #error unsupported platform
 #endif

 #define CHECKED_CALL(call) \
     do { \
         CUresult result = (call); \
         if (CUDA_SUCCESS != result) { \
             return result; \
         } \
-    } while(0)
+    } while (0)

-#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1)
-#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0)
+#define GET_PROC_REQUIRED(name) GET_PROC_EX(name, name, 1)
+#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name, name, 0)
 #define GET_PROC(name) GET_PROC_REQUIRED(name)
-#define GET_PROC_V2(name) GET_PROC_EX_V2(name,name,1)
-#define GET_PROC_V3(name) GET_PROC_EX_V3(name,name,1)
+#define GET_PROC_V2(name) GET_PROC_EX_V2(name, name, 1)
+#define GET_PROC_V3(name) GET_PROC_EX_V3(name, name, 1)

 CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
 {
     CUDADRIVER CudaDrvLib;
     int driverVer = 1000;

     CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));
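For orientation, GET_PROC_V2(cuCtxCreate) on the dlopen path expands to roughly the following (assuming STRINGIFY performs ordinary two-step token stringification, which is not shown in this diff):

    cuCtxCreate = (tcuCtxCreate *)dlsym(CudaDrvLib, "cuCtxCreate_v2");
    if (cuCtxCreate == NULL && 1) {
        printf("Failed to find required function \"%s\" in %s\n", "cuCtxCreate_v2", __CudaLibName);
        return CUDA_ERROR_UNKNOWN;
    }

so the versioned entry point cuCtxCreate_v2 in libcuda is bound to the unsuffixed pointer.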
@@ -359,8 +352,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
     // available since 2.2. if not present, version 1.0 is assumed
     GET_PROC_OPTIONAL(cuDriverGetVersion);

-    if (cuDriverGetVersion)
-    {
+    if (cuDriverGetVersion) {
         CHECKED_CALL(cuDriverGetVersion(&driverVer));
     }
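For reference, cuDriverGetVersion reports the version encoded as 1000 * major + 10 * minor, which is the form all of the driverVer guards below rely on, and the initial value of 1000 is exactly the CUDA 1.0 fallback mentioned in the comment above. For example:

    // a 5.0 driver reports driverVer == 5000, so the (driverVer >= 5000) block below runs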
@@ -428,24 +420,21 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
     GET_PROC(cuStreamDestroy);

     // These are CUDA 5.0 new functions
-    if (driverVer >= 5000)
-    {
+    if (driverVer >= 5000) {
         GET_PROC(cuMipmappedArrayCreate);
         GET_PROC(cuMipmappedArrayDestroy);
         GET_PROC(cuMipmappedArrayGetLevel);
     }

     // These are CUDA 4.2 new functions
-    if (driverVer >= 4020)
-    {
+    if (driverVer >= 4020) {
         GET_PROC(cuFuncSetSharedMemConfig);
         GET_PROC(cuCtxGetSharedMemConfig);
         GET_PROC(cuCtxSetSharedMemConfig);
     }

     // These are CUDA 4.1 new functions
-    if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
-    {
+    if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
         GET_PROC(cuDeviceGetByPCIBusId);
         GET_PROC(cuDeviceGetPCIBusId);
         GET_PROC(cuIpcGetEventHandle);
@@ -456,8 +445,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
     }

     // These could be _v2 interfaces
-    if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000)
-    {
+    if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) {
         GET_PROC_V2(cuCtxDestroy);
         GET_PROC_V2(cuCtxPopCurrent);
         GET_PROC_V2(cuCtxPushCurrent);
@@ -465,8 +453,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC_V2(cuEventDestroy);
     }

-    if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
-    {
+    if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
         GET_PROC_V2(cuDeviceTotalMem);
         GET_PROC_V2(cuCtxCreate);
         GET_PROC_V2(cuModuleGetGlobal);
@@ -507,17 +494,14 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC_V2(cuTexRefSetAddress);
         GET_PROC_V2(cuTexRefGetAddress);

-        if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
-        {
+        if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
             GET_PROC_V3(cuTexRefSetAddress2D);
         }
-        else
-        {
+        else {
             GET_PROC_V2(cuTexRefSetAddress2D);
         }
     }
-    else
-    {
+    else {
         // versions earlier than 3020
         GET_PROC(cuDeviceTotalMem);
         GET_PROC(cuCtxCreate);
@@ -562,8 +546,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
     }

     // The following functions are specific to CUDA versions
-    if (driverVer >= 4000)
-    {
+    if (driverVer >= 4000) {
         GET_PROC(cuCtxSetCurrent);
         GET_PROC(cuCtxGetCurrent);
         GET_PROC(cuMemHostRegister);
@@ -574,8 +557,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC(cuProfilerStop);
     }

-    if (driverVer >= 3010)
-    {
+    if (driverVer >= 3010) {
         GET_PROC(cuModuleGetSurfRef);
         GET_PROC(cuSurfRefSetArray);
         GET_PROC(cuSurfRefGetArray);
@@ -583,8 +565,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC(cuCtxGetLimit);
     }

-    if (driverVer >= 3000)
-    {
+    if (driverVer >= 3000) {
         GET_PROC(cuMemcpyDtoDAsync);
         GET_PROC(cuFuncSetCacheConfig);
 #ifdef CUDA_INIT_D3D11
@@ -595,12 +576,10 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC(cuGraphicsUnregisterResource);
         GET_PROC(cuGraphicsSubResourceGetMappedArray);

-        if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
-        {
+        if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
             GET_PROC_V2(cuGraphicsResourceGetMappedPointer);
         }
-        else
-        {
+        else {
             GET_PROC(cuGraphicsResourceGetMappedPointer);
         }
@@ -610,8 +589,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC(cuGetExportTable);
     }

-    if (driverVer >= 2030)
-    {
+    if (driverVer >= 2030) {
         GET_PROC(cuMemHostGetFlags);
 #ifdef CUDA_INIT_D3D10
         GET_PROC(cuD3D10GetDevice);
@@ -624,17 +602,16 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
 #endif
     }

-    if (driverVer >= 2010)
-    {
+    if (driverVer >= 2010) {
         GET_PROC(cuModuleLoadDataEx);
         GET_PROC(cuModuleLoadFatBinary);
 #ifdef CUDA_INIT_OPENGL
         GET_PROC(cuGLCtxCreate);
         GET_PROC(cuGraphicsGLRegisterBuffer);
         GET_PROC(cuGraphicsGLRegisterImage);
-#  ifdef WIN32
+#ifdef WIN32
         GET_PROC(cuWGLGetDevice);
-#  endif
+#endif
 #endif
 #ifdef CUDA_INIT_D3D9
         GET_PROC(cuD3D9GetDevice);
File diff suppressed because it is too large
@@ -14,21 +14,17 @@
 #ifndef HELPER_CUDA_DRVAPI_H
 #define HELPER_CUDA_DRVAPI_H

+#include <helper_string.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

-#include <helper_string.h>
-
 #ifndef MAX
 #define MAX(a, b) (a > b ? a : b)
 #endif

 #ifndef HELPER_CUDA_DRVAPI_H
-inline int ftoi(float value) {
-    return (value >= 0 ? static_cast<int>(value + 0.5)
-                       : static_cast<int>(value - 0.5));
-}
+inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5) : static_cast<int>(value - 0.5)); }
 #endif

 #ifndef EXIT_WAIVED
@@ -47,311 +43,302 @@ inline int ftoi(float value) {
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)

// These are the inline versions for all of the SDK helper functions
inline void __checkCudaErrors(CUresult err, const char *file, const int line)
{
    if (CUDA_SUCCESS != err) {
        const char *errorStr = NULL;
        cuGetErrorString(err, &errorStr);
        fprintf(stderr,
                "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
                "line %i.\n",
                err,
                errorStr,
                file,
                line);
        exit(EXIT_FAILURE);
    }
}
#endif

// This function wraps the CUDA Driver API into a template function
template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
}
#endif

// Beginning of GPU Architecture definitions
inline int _ConvertSMVer2CoresDRV(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the #
    // of cores per SM
    typedef struct
    {
        int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM
                // minor version
        int Cores;
    } sSMtoCores;

    sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192},
                                       {0x32, 192},
                                       {0x35, 192},
                                       {0x37, 192},
                                       {0x50, 128},
                                       {0x52, 128},
                                       {0x53, 128},
                                       {0x60, 64},
                                       {0x61, 128},
                                       {0x62, 128},
                                       {0x70, 64},
                                       {0x72, 64},
                                       {0x75, 64},
                                       {0x80, 64},
                                       {0x86, 128},
                                       {0x87, 128},
                                       {0x90, 128},
                                       {-1, -1}};

    int index = 0;

    while (nGpuArchCoresPerSM[index].SM != -1) {
        if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
            return nGpuArchCoresPerSM[index].Cores;
        }

        index++;
    }

    // If we don't find the values, we default use the previous one to run
    // properly
    printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
           major,
           minor,
           nGpuArchCoresPerSM[index - 1].Cores);
    return nGpuArchCoresPerSM[index - 1].Cores;
}
// end of GPU Architecture definitions
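As a quick sanity check on the table above: for an SM 8.6 device, _ConvertSMVer2CoresDRV(8, 6) computes the key as (8 << 4) + 6 = 0x86 and returns 128. A rough total-core count (an illustrative sketch, not part of the helper itself) is:

    int coresPerSM = _ConvertSMVer2CoresDRV(major, minor); // e.g. 128 for SM 8.6
    int totalCores = multiProcessorCount * coresPerSM;     // SM count via CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT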
#ifdef __cuda_cuda_h__
// General GPU Device CUDA Initialization
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
{
    int cuDevice = 0;
    int deviceCount = 0;
    checkCudaErrors(cuInit(0, __CUDA_API_VERSION));

    checkCudaErrors(cuDeviceGetCount(&deviceCount));

    if (deviceCount == 0) {
        fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
        exit(EXIT_FAILURE);
    }

    int dev = 0;
    dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device=");

    if (dev < 0) {
        dev = 0;
    }

    if (dev > deviceCount - 1) {
        fprintf(stderr, "\n");
        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
        fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
        fprintf(stderr, "\n");
        return -dev;
    }

    checkCudaErrors(cuDeviceGet(&cuDevice, dev));
    char name[100];
    checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));

    int computeMode;
    getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);

    if (computeMode == CU_COMPUTEMODE_PROHIBITED) {
        fprintf(stderr,
                "Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no "
                "threads can use this CUDA Device.\n");
        return -1;
    }

    if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) {
        printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
    }

    return dev;
}
// This function returns the best GPU based on performance
inline int gpuGetMaxGflopsDeviceIdDRV()
{
    CUdevice current_device = 0;
    CUdevice max_perf_device = 0;
    int device_count = 0;
    int sm_per_multiproc = 0;
    unsigned long long max_compute_perf = 0;
    int major = 0;
    int minor = 0;
    int multiProcessorCount;
    int clockRate;
    int devices_prohibited = 0;

    cuInit(0, __CUDA_API_VERSION);
    checkCudaErrors(cuDeviceGetCount(&device_count));

    if (device_count == 0) {
        fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
        exit(EXIT_FAILURE);
    }

    // Find the best CUDA capable GPU device
    current_device = 0;

    while (current_device < device_count) {
        checkCudaErrors(
            cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device));
        checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));

        int computeMode;
        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);

        if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
            if (major == 9999 && minor == 9999) {
                sm_per_multiproc = 1;
            }
            else {
                sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
            }

            unsigned long long compute_perf = (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);

            if (compute_perf > max_compute_perf) {
                max_compute_perf = compute_perf;
                max_perf_device = current_device;
            }
        }
        else {
            devices_prohibited++;
        }

        ++current_device;
    }

    if (devices_prohibited == device_count) {
        fprintf(stderr,
                "gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode "
                "prohibited.\n");
        exit(EXIT_FAILURE);
    }

    return max_perf_device;
}
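The selection metric here is compute_perf = multiProcessorCount * sm_per_multiproc * clockRate. Since CU_DEVICE_ATTRIBUTE_CLOCK_RATE is reported in kHz, the product is a relative throughput proxy rather than actual GFLOPS; with illustrative numbers only, a device with 28 SMs, 128 cores/SM and a 1,700,000 kHz clock scores 28 * 128 * 1700000, about 6.1e9, and beats any device scoring less.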
// General initialization call to pick the best CUDA Device
inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
{
    CUdevice cuDevice;
    int devID = 0;

    // If the command-line has a device number specified, use it
    if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
        devID = gpuDeviceInitDRV(argc, argv);

        if (devID < 0) {
            printf("exiting...\n");
            exit(EXIT_SUCCESS);
        }
    }
    else {
        // Otherwise pick the device with highest Gflops/s
        char name[100];
        devID = gpuGetMaxGflopsDeviceIdDRV();
        checkCudaErrors(cuDeviceGet(&cuDevice, devID));
        cuDeviceGetName(name, 100, cuDevice);
        printf("> Using CUDA Device [%d]: %s\n", devID, name);
    }

    cuDeviceGet(&cuDevice, devID);

    return cuDevice;
}
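Typical use of this helper (a sketch; argc and argv come from the host application's main):

    CUdevice dev = findCudaDeviceDRV(argc, (const char **)argv);
    // honors an optional -device=N flag, otherwise picks the highest-scoring GPU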
inline CUdevice findIntegratedGPUDrv()
{
    CUdevice current_device = 0;
    int device_count = 0;
    int devices_prohibited = 0;
    int isIntegrated;

    cuInit(0, __CUDA_API_VERSION);
    checkCudaErrors(cuDeviceGetCount(&device_count));

    if (device_count == 0) {
        fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    // Find the integrated GPU which is compute capable
    while (current_device < device_count) {
        int computeMode = -1;
        checkCudaErrors(cuDeviceGetAttribute(&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
        checkCudaErrors(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));

        // If GPU is integrated and is not running on Compute Mode prohibited use
        // that
        if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
            int major = 0, minor = 0;
            char deviceName[256];
            checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
            checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
            checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
            printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device, deviceName, major, minor);

            return current_device;
        }
        else {
            devices_prohibited++;
        }

        current_device++;
    }

    if (devices_prohibited == device_count) {
        fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n");
        exit(EXIT_FAILURE);
    }

    return -1;
}
// General check for CUDA GPU SM Capabilities
inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
{
    CUdevice cuDevice;
    char name[256];
    int major = 0, minor = 0;

    checkCudaErrors(cuDeviceGet(&cuDevice, devID));
    checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));

    if ((major > major_version) || (major == major_version && minor >= minor_version)) {
        printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
        return true;
    }
    else {
        printf("No GPU device was found that can support CUDA compute capability "
               "%d.%d.\n",
               major_version,
               minor_version);
        return false;
    }
}
#endif

// end of CUDA Helper Functions

#endif // HELPER_CUDA_DRVAPI_H
@@ -34,8 +34,8 @@
 #define WA (4 * block_size) // Matrix A width
 #define HA (6 * block_size) // Matrix A height
 #define WB (4 * block_size) // Matrix B width
 #define HB WA               // Matrix B height
 #define WC WB               // Matrix C width
 #define HC HA               // Matrix C height

 #endif // _MATRIXMUL_H_
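To make the geometry concrete: initCUDA() below fixes block_size = 32, so WA = WB = WC = HB = 128 and HA = HC = 192. Matrix C is therefore HC x WC = 192 x 128 = 24576 floats, i.e. mem_size_C = 24576 * 4 = 98304 bytes.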
@@ -43,10 +43,10 @@
 */

 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 // includes, CUDA
 #include "cuda_drvapi_dynlink.h"
@@ -60,7 +60,7 @@
 extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);

 #if defined _MSC_VER
-#pragma warning (disable : 4312)
+#pragma warning(disable : 4312)
 #endif

@@ -68,7 +68,7 @@ extern "C" void computeGold(float *, const float *, const float *, unsigned int,
 // Globals
 ////////////////////////////////////////////////////////////////////////////////
 CUcontext g_cuContext;
 bool noprompt = false;

 static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
@@ -78,8 +78,7 @@ static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
 ////////////////////////////////////////////////////////////////////////////////
 void randomInit(float *data, size_t size)
 {
-    for (size_t i = 0; i < size; ++i)
-    {
+    for (size_t i = 0; i < size; ++i) {
         data[i] = rand() / (float)RAND_MAX;
     }
 }
@@ -89,33 +88,29 @@ void randomInit(float *data, size_t size)
 ////////////////////////////////////////////////////////////////////////////////
 CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size_out)
 {
     CUresult status;
     CUdevice cuDevice;
     CUmodule cuModule;
     CUfunction cuFunction;
     int major, minor, block_size, devID = 0;
     char deviceName[256];

     // link to cuda driver dynamically
     checkCudaErrors(cuInit(0, __CUDA_API_VERSION));

     // This assumes that the user is attempting to specify a explicit device -device=n
-    if (argc > 1)
-    {
+    if (argc > 1) {
         bool bFound = false;

-        for (int param=0; param < argc; param++)
-        {
-            if (!strncmp(argv[param], "-device", 7))
-            {
-                int i=(int)strlen(argv[1]);
+        for (int param = 0; param < argc; param++) {
+            if (!strncmp(argv[param], "-device", 7)) {
+                int i = (int)strlen(argv[1]);

-                while (argv[1][i] != '=')
-                {
+                while (argv[1][i] != '=') {
                     i--;
                 }

                 devID = atoi(&argv[1][++i]);
                 bFound = true;
             }

@@ -128,16 +123,15 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
     int deviceCount = 0;
     checkCudaErrors(cuDeviceGetCount(&deviceCount));

-    if (deviceCount == 0)
-    {
+    if (deviceCount == 0) {
         fprintf(stderr, "No devices supporting CUDA detected, exiting...\n");
         exit(EXIT_SUCCESS);
     }

-    if (devID < 0) devID = 0;
+    if (devID < 0)
+        devID = 0;

-    if (devID > deviceCount -1)
-    {
+    if (devID > deviceCount - 1) {
         fprintf(stderr, "initCUDA (Device=%d) invalid GPU device. %d GPU device(s) detected.\n\n", devID, deviceCount);
         status = CUDA_ERROR_NOT_FOUND;

@@ -153,14 +147,13 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
     checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice));
     printf("> Device %d: \"%s\" with Compute %d.%d capability\n", cuDevice, deviceName, major, minor);

     block_size = 32;
     *block_size_out = block_size;

     // create context for picked device
     status = cuCtxCreate(&g_cuContext, 0, cuDevice);

-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
         cuCtxDestroy(g_cuContext);
         exit(EXIT_SUCCESS);
     }
@@ -169,53 +162,53 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
     {
         // in this branch we use compilation with parameters
         const unsigned int jitNumOptions = 3;
         CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
         void **jitOptVals = new void *[jitNumOptions];

         // set up size of compilation log buffer
         jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
         int jitLogBufferSize = 1024;
         jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

         // set up pointer to the compilation log buffer
         jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
         char *jitLogBuffer = new char[jitLogBufferSize];
         jitOptVals[1] = jitLogBuffer;

         // set up pointer to set the Maximum # of registers for a particular kernel
         jitOptions[2] = CU_JIT_MAX_REGISTERS;
         int jitRegCount = 32;
         jitOptVals[2] = (void *)(size_t)jitRegCount;

         // compile with set parameters
         printf("> Compiling CUDA module\n");

 #if defined(_WIN64) || defined(__LP64__)
-        status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
+        status =
+            cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
 #else
-        status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
+        status =
+            cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
 #endif

         printf("> PTX JIT log:\n%s\n", jitLogBuffer);

-        delete [] jitOptions;
-        delete [] jitOptVals;
-        delete [] jitLogBuffer;
+        delete[] jitOptions;
+        delete[] jitOptVals;
+        delete[] jitLogBuffer;
     }

-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
         printf("Error while compiling PTX\n");
         cuCtxDestroy(g_cuContext);
         exit(EXIT_FAILURE);
     }

     // retrieve CUDA function from the compiled module
-    status = cuModuleGetFunction(&cuFunction, cuModule,
-                                 (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
+    status = cuModuleGetFunction(
+        &cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");

-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
         cuCtxDestroy(g_cuContext);
         exit(EXIT_FAILURE);
     }
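One detail worth noting in the JIT setup above: cuModuleLoadDataEx takes parallel arrays of CUjit_option keys and void * values, and scalar options are passed by value smuggled through the pointer rather than by address, hence the (void *)(size_t) casts. The pattern, restated:

    jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
    jitOptVals[0] = (void *)(size_t)jitLogBufferSize; // the integer itself, cast to void *
    jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
    jitOptVals[1] = jitLogBuffer;                     // a real pointer for buffer options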
@@ -233,21 +226,21 @@ int main(int argc, char **argv)
     printf("[ %s ]\n", sSDKsample);

     // initialize CUDA
     CUfunction matrixMul = NULL;
     int block_size = 0;
     checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size));

     // set seed for rand()
     srand(2006);

     // allocate host memory for matrices A and B
     size_t size_A = WA * HA;
     size_t mem_size_A = sizeof(float) * size_A;
     size_t size_B = WB * HB;
     size_t mem_size_B = sizeof(float) * size_B;

-    float *h_A = (float *) malloc(mem_size_A);
-    float *h_B = (float *) malloc(mem_size_B);
+    float *h_A = (float *)malloc(mem_size_A);
+    float *h_B = (float *)malloc(mem_size_B);

     // initialize host memory
     randomInit(h_A, size_A);
@ -264,26 +257,24 @@ int main(int argc, char **argv)
|
|||||||
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
|
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
|
||||||
|
|
||||||
// allocate device memory for result
|
// allocate device memory for result
|
||||||
size_t size_C = WC * HC;
|
size_t size_C = WC * HC;
|
||||||
size_t mem_size_C = sizeof(float) * size_C;
|
size_t mem_size_C = sizeof(float) * size_C;
|
||||||
|
|
||||||
CUdeviceptr d_C;
|
CUdeviceptr d_C;
|
||||||
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
|
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
|
||||||
|
|
||||||
// allocate mem for the result on host side
|
// allocate mem for the result on host side
|
||||||
float *h_C = (float *) malloc(mem_size_C);
|
float *h_C = (float *)malloc(mem_size_C);
|
||||||
|
|
||||||
#if __CUDA_API_VERSION >= 4000
|
#if __CUDA_API_VERSION >= 4000
|
||||||
{
|
{
|
||||||
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
|
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
|
||||||
int Matrix_Width_A = WA;
|
int Matrix_Width_A = WA;
|
||||||
int Matrix_Width_B = WB;
|
int Matrix_Width_B = WB;
|
||||||
void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
|
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
|
||||||
|
|
||||||
checkCudaErrors(cuLaunchKernel(matrixMul, (WC/block_size), (HC/block_size), 1,
|
checkCudaErrors(cuLaunchKernel(
|
||||||
block_size , block_size , 1,
|
matrixMul, (WC / block_size), (HC / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
|
||||||
0,
|
|
||||||
NULL, args, NULL));
|
|
||||||
}
|
}
|
||||||
#else // __CUDA_API_VERSION <= 3020
|
#else // __CUDA_API_VERSION <= 3020
|
||||||
{
|
{
|
||||||
@ -312,7 +303,7 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
checkCudaErrors(cuParamSetSize(matrixMul, offset));
|
checkCudaErrors(cuParamSetSize(matrixMul, offset));
|
||||||
checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1));
|
checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1));
|
||||||
checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2*block_size*block_size*sizeof(float)));
|
checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2 * block_size * block_size * sizeof(float)));
|
||||||
|
|
||||||
// set execution configuration for the CUDA kernel
|
// set execution configuration for the CUDA kernel
|
||||||
checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size));
|
checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size));
|
||||||
@ -322,19 +313,18 @@ int main(int argc, char **argv)
|
|||||||
checkCudaErrors(cuCtxSynchronize());
|
checkCudaErrors(cuCtxSynchronize());
|
||||||
|
|
||||||
// copy result from device to host
|
// copy result from device to host
|
||||||
checkCudaErrors(cuMemcpyDtoH((void *) h_C, d_C, mem_size_C));
|
checkCudaErrors(cuMemcpyDtoH((void *)h_C, d_C, mem_size_C));
|
||||||
|
|
||||||
// compute reference solution
|
// compute reference solution
|
||||||
float *reference = (float *) malloc(mem_size_C);
|
float *reference = (float *)malloc(mem_size_C);
|
||||||
computeGold(reference, h_A, h_B, HA, WA, WB);
|
computeGold(reference, h_A, h_B, HA, WA, WB);
|
||||||
|
|
||||||
// check result
|
// check result
|
||||||
float diff=0.0f;
|
float diff = 0.0f;
|
||||||
|
|
||||||
for (unsigned int i=0; i<size_C; i++)
|
for (unsigned int i = 0; i < size_C; i++) {
|
||||||
{
|
|
||||||
float tmp = reference[i] - h_C[i];
|
float tmp = reference[i] - h_C[i];
|
||||||
diff += tmp*tmp;
|
diff += tmp * tmp;
|
||||||
}
|
}
|
||||||
|
|
||||||
int res = (diff / (float)size_C < 1e-6f);
|
int res = (diff / (float)size_C < 1e-6f);
|
||||||
@ -349,7 +339,7 @@ int main(int argc, char **argv)
|
|||||||
checkCudaErrors(cuMemFree(d_C));
|
checkCudaErrors(cuMemFree(d_C));
|
||||||
checkCudaErrors(cuCtxDestroy(g_cuContext));
|
checkCudaErrors(cuCtxDestroy(g_cuContext));
|
||||||
|
|
||||||
printf("Test run %s\n", (1==res) ? "success!" : "failed!");
|
printf("Test run %s\n", (1 == res) ? "success!" : "failed!");
|
||||||
|
|
||||||
exit((1 == res) ? EXIT_SUCCESS : EXIT_FAILURE);
|
exit((1 == res) ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
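A detail worth noting in the CUDA 4.0 launch path above: each entry of args is the address of a kernel argument, not the argument itself; cuLaunchKernel reads the pointed-to storage and copies the values into the kernel's parameter space. A minimal sketch of the same call with that made explicit (names follow the excerpt above):

// Sketch of the cuLaunchKernel argument-passing convention used above.
// Each args[i] points at host storage holding parameter i's value.
int Matrix_Width_A = WA; // the kernel receives the int values, not these pointers
int Matrix_Width_B = WB;
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};

checkCudaErrors(cuLaunchKernel(matrixMul,
                               WC / block_size, HC / block_size, 1, // grid dims
                               block_size, block_size, 1,           // block dims
                               0,                                   // dynamic shared memory bytes
                               NULL,                                // stream (default)
                               args,                                // kernel parameters
                               NULL));                              // "extra" options
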
@@ -28,8 +28,7 @@

 ////////////////////////////////////////////////////////////////////////////////
 // export C interface
-extern "C"
-void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
+extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);

 ////////////////////////////////////////////////////////////////////////////////
 //! Compute reference data set
@@ -40,16 +39,13 @@ void computeGold(float *, const float *, const float *, unsigned int, unsigned i
 //! @param hA height of matrix A
 //! @param wB width of matrix B
 ////////////////////////////////////////////////////////////////////////////////
-void
-computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
+void computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
 {
     for (unsigned int i = 0; i < hA; ++i)
-        for (unsigned int j = 0; j < wB; ++j)
-        {
+        for (unsigned int j = 0; j < wB; ++j) {
             double sum = 0;

-            for (unsigned int k = 0; k < wA; ++k)
-            {
+            for (unsigned int k = 0; k < wA; ++k) {
                 double a = A[i * wA + k];
                 double b = B[k * wB + j];
                 sum += a * b;

(File diff suppressed because it is too large)

@@ -32,7 +32,8 @@
 #define __matrixMul_kernel_32_ptxdump_h__

 #if defined __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 extern unsigned char matrixMul_kernel_32_ptxdump[25784];

(File diff suppressed because it is too large)

@@ -32,7 +32,8 @@
 #define __matrixMul_kernel_64_ptxdump_h__

 #if defined __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 extern unsigned char matrixMul_kernel_64_ptxdump[26489];

@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
 endif()

 # Include directories and libraries

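ENABLE_CUDA_DEBUG here is an ordinary user-settable CMake option: configuring with, for example, cmake -DENABLE_CUDA_DEBUG=ON selects -G (full device-side debug info for cuda-gdb, with the performance cost the comment warns about), while every other configuration falls back to -lineinfo, which preserves source-line mappings for profiling and debugging tools at much lower cost. As the comment notes, the two flags cannot be combined.
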
@@ -2,7 +2,7 @@

 ## Description

-This sample implements matrix multiplication and is exactly the same as Chapter 6 of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the new CUDA 4.0 interface for CUBLAS to demonstrate high-performance performance for matrix multiplication.
+This sample implements matrix multiplication and is exactly the same as the second example of the [Shared Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory) section of the programming guide. It has been written for clarity of exposition to illustrate various CUDA programming principles, not with the goal of providing the most performant generic kernel for matrix multiplication. To illustrate GPU performance for matrix multiply, this sample also shows how to use the CUDA 4.0+ interface for cuBLAS to demonstrate high-performance performance for matrix multiplication.

 ## Key Concepts

@@ -30,7 +30,7 @@ cuMemcpyDtoH, cuLaunchKernel, cuMemcpyHtoD, cuCtxSynchronize, cuMemAlloc, cuMemF

 ## Prerequisites

-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)

@@ -42,207 +42,208 @@
 */

 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>

 // CUDA runtime
 #include <cuda_runtime.h>

 #include "nvrtc_helper.h"

 // Helper functions and utilities to work with CUDA
 #include <helper_functions.h>

-void constantInit(float *data, int size, float val) {
+void constantInit(float *data, int size, float val)
+{
     for (int i = 0; i < size; ++i) {
         data[i] = val;
     }
 }

 /**
  * Run a simple test of matrix multiplication using CUDA
  */
-int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
-                   dim3 &dimsB) {
+int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
+{
     // Allocate host memory for matrices A and B
     unsigned int size_A = dimsA.x * dimsA.y;
     unsigned int mem_size_A = sizeof(float) * size_A;
     float *h_A = (float *)malloc(mem_size_A);
     unsigned int size_B = dimsB.x * dimsB.y;
     unsigned int mem_size_B = sizeof(float) * size_B;
     float *h_B = (float *)malloc(mem_size_B);

     // Initialize host memory
     const float valB = 0.01f;
     constantInit(h_A, size_A, 1.0f);
     constantInit(h_B, size_B, valB);

     // Allocate device memory
     CUdeviceptr d_A, d_B, d_C;

     char *cubin, *kernel_file;
     size_t cubinSize;

     kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
     compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1);

     CUmodule module = loadCUBIN(cubin, argc, argv);

     // Allocate host matrix C
     dim3 dimsC(dimsB.x, dimsA.y, 1);
     unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
     float *h_C = (float *)malloc(mem_size_C);

     if (h_C == NULL) {
         fprintf(stderr, "Failed to allocate host matrix C!\n");
         exit(EXIT_FAILURE);
     }

     checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
     checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
     checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));

     // copy host memory to device
     checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
     checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));

     // Setup execution parameters
     dim3 threads(block_size, block_size);
     dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

     // Create and start timer
     printf("Computing result using CUDA Kernel...\n");

     CUfunction kernel_addr;
     if (block_size == 16) {
-        checkCudaErrors(
-            cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
-    } else {
-        checkCudaErrors(
-            cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
+        checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
+    }
+    else {
+        checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
     }

-    void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x,
-                   (void *)&dimsB.x};
+    void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x};

     // Execute the kernel
     int nIter = 300;

     for (int j = 0; j < nIter; j++) {
-        checkCudaErrors(
-            cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */
-                           threads.x, threads.y, threads.z, /* block dim */
-                           0, 0, /* shared mem, stream */
-                           &arr[0], /* arguments */
-                           0));
+        checkCudaErrors(cuLaunchKernel(kernel_addr,
+                                       grid.x,
+                                       grid.y,
+                                       grid.z, /* grid dim */
+                                       threads.x,
+                                       threads.y,
+                                       threads.z, /* block dim */
+                                       0,
+                                       0, /* shared mem, stream */
+                                       &arr[0], /* arguments */
+                                       0));

         checkCudaErrors(cuCtxSynchronize());
     }

     // Copy result from device to host
     checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));

     printf("Checking computed result for correctness: ");

     bool correct = true;

     // test relative error by the formula
     // |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
     double eps = 1.e-6; // machine zero

     for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
         double abs_err = fabs(h_C[i] - (dimsA.x * valB));
         double dot_length = dimsA.x;
         double abs_val = fabs(h_C[i]);
         double rel_err = abs_err / abs_val / dot_length;

         if (rel_err > eps) {
-            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
-                   h_C[i], dimsA.x * valB, eps);
+            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
             correct = false;
         }
     }

     printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

-    printf(
-        "\nNOTE: The CUDA Samples are not meant for performance measurements. "
-        "Results may vary when GPU Boost is enabled.\n");
+    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
+           "Results may vary when GPU Boost is enabled.\n");

     // Clean up memory
     free(h_A);
     free(h_B);
     free(h_C);

     checkCudaErrors(cuMemFree(d_A));
     checkCudaErrors(cuMemFree(d_B));
     checkCudaErrors(cuMemFree(d_C));

     if (correct) {
         return EXIT_SUCCESS;
-    } else {
+    }
+    else {
         return EXIT_FAILURE;
     }
 }

 /**
  * Program main
  */
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("[Matrix Multiply Using CUDA] - Starting...\n");

-    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
-        checkCmdLineFlag(argc, (const char **)argv, "?")) {
+    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
         printf("Usage -device=n (n >= 0 for deviceID)\n");
         printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
         printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
-        printf(
-            " Note: Outer matrix dimensions of A & B matrices must be equal.\n");
+        printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n");

         exit(EXIT_SUCCESS);
     }

     int block_size = 32;

     // original:
     dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
     dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);

     // reduce sizes to avoid running out of memory
     // dim3 dimsA(32,32, 1);
     // dim3 dimsB(32,32,1);

     // width of Matrix A
     if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
         dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
     }

     // height of Matrix A
     if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
         dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
     }

     // width of Matrix B
     if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
         dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
     }

     // height of Matrix B
     if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
         dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
     }

     if (dimsA.x != dimsB.y) {
-        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
-               dimsA.x, dimsB.y);
+        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
         exit(EXIT_FAILURE);
     }

-    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x,
-           dimsB.y);
+    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

     int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);

     exit(matrix_result);
 }

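One way to read the correctness test above: with h_A filled with 1.0f and h_B with valB = 0.01f, every element of C is a dot product of dimsA.x ones with dimsA.x copies of 0.01f, so the exact reference value is dimsA.x * valB (3.2 for the default dimsA.x = 5 * 2 * 32 = 320), and rel_err only measures accumulated float rounding, which is why eps can be as tight as 1e-6.
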
@@ -48,84 +48,83 @@

 #include <cooperative_groups.h>

-template <int BLOCK_SIZE>
-__device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
+template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
+{
     // Handle to thread block group
-    cooperative_groups::thread_block cta =
-        cooperative_groups::this_thread_block();
+    cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
     // Block index
     int bx = blockIdx.x;
     int by = blockIdx.y;

     // Thread index
     int tx = threadIdx.x;
     int ty = threadIdx.y;

     // Index of the first sub-matrix of A processed by the block
     int aBegin = wA * BLOCK_SIZE * by;

     // Index of the last sub-matrix of A processed by the block
     int aEnd = aBegin + wA - 1;

     // Step size used to iterate through the sub-matrices of A
     int aStep = BLOCK_SIZE;

     // Index of the first sub-matrix of B processed by the block
     int bBegin = BLOCK_SIZE * bx;

     // Step size used to iterate through the sub-matrices of B
     int bStep = BLOCK_SIZE * wB;

     // Csub is used to store the element of the block sub-matrix
     // that is computed by the thread
     float Csub = 0;

     // Loop over all the sub-matrices of A and B
     // required to compute the block sub-matrix
     for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
         // Declaration of the shared memory array As used to
         // store the sub-matrix of A
         __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

         // Declaration of the shared memory array Bs used to
         // store the sub-matrix of B
         __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

         // Load the matrices from device memory
         // to shared memory; each thread loads
         // one element of each matrix
         As[ty][tx] = A[a + wA * ty + tx];
         Bs[ty][tx] = B[b + wB * ty + tx];

         // Synchronize to make sure the matrices are loaded
         cooperative_groups::sync(cta);

         // Multiply the two matrices together;
         // each thread computes one element
         // of the block sub-matrix
 #pragma unroll
         for (int k = 0; k < BLOCK_SIZE; ++k) {
             Csub += As[ty][k] * Bs[k][tx];
         }

         // Synchronize to make sure that the preceding
         // computation is done before loading two new
         // sub-matrices of A and B in the next iteration
         cooperative_groups::sync(cta);
     }

     // Write the block sub-matrix to device memory;
     // each thread writes one element
     int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
     C[c + wB * ty + tx] = Csub;
 }

-extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B,
-                                                 int wA, int wB) {
+extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
+{
     matrixMulCUDA<16>(C, A, B, wA, wB);
 }

-extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B,
-                                                 int wA, int wB) {
+extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
+{
     matrixMulCUDA<32>(C, A, B, wA, wB);
 }

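For a quick sense of the kernel's footprint: each block statically allocates two BLOCK_SIZE x BLOCK_SIZE float tiles (As and Bs), runs one thread per element of its output tile, and the grid covers C tile by tile. A small host-side sketch of the implied launch configuration (a hypothetical helper using runtime-API types for brevity; it assumes the matrix extents are multiples of BLOCK_SIZE, as the sample's defaults are):

#include <cuda_runtime.h>

// Sketch: launch geometry implied by matrixMulCUDA<32>.
void launchConfigForTiledGemm(int hA, int wB, dim3 *grid, dim3 *threads, size_t *smemBytes)
{
    const int BLOCK_SIZE = 32;
    *threads = dim3(BLOCK_SIZE, BLOCK_SIZE, 1);               // 1024 threads per block
    *grid = dim3(wB / BLOCK_SIZE, hA / BLOCK_SIZE, 1);        // one block per C tile
    *smemBytes = 2 * BLOCK_SIZE * BLOCK_SIZE * sizeof(float); // 8 KiB, allocated statically
}
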
@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
 endif()

 # Include directories and libraries

@@ -27,6 +27,6 @@ cudaMalloc, cudaDeviceSynchronize, cudaMemcpy, cudaFree

 ## Prerequisites

-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)

@@ -28,252 +28,254 @@
 #include <cooperative_groups.h>

 namespace cg = cooperative_groups;
-#include <helper_cuda.h>
 #include <assert.h>
+#include <helper_cuda.h>

 #include "mergeSort_common.h"

-inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB,
-                                  uint &valB, uint arrowDir) {
+inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB, uint arrowDir)
+{
     uint t;

     if ((keyA > keyB) == arrowDir) {
         t = keyA;
         keyA = keyB;
         keyB = t;
         t = valA;
         valA = valB;
         valB = t;
     }
 }

-__global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
-                                        uint *d_SrcKey, uint *d_SrcVal,
-                                        uint arrayLength, uint sortDir) {
+__global__ void
+bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint sortDir)
+{
     // Handle to thread block group
     cg::thread_block cta = cg::this_thread_block();
     // Shared memory storage for one or more short vectors
     __shared__ uint s_key[SHARED_SIZE_LIMIT];
     __shared__ uint s_val[SHARED_SIZE_LIMIT];

     // Offset to the beginning of subbatch and load data
     d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
     d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
     d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
     d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
     s_key[threadIdx.x + 0] = d_SrcKey[0];
     s_val[threadIdx.x + 0] = d_SrcVal[0];
-    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
-        d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
-    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
-        d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
+    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
+    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];

     for (uint size = 2; size < arrayLength; size <<= 1) {
         // Bitonic merge
         uint dir = (threadIdx.x & (size / 2)) != 0;

         for (uint stride = size / 2; stride > 0; stride >>= 1) {
             cg::sync(cta);
             uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
-                       s_val[pos + stride], dir);
+            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir);
         }
     }

     // ddd == sortDir for the last bitonic merge step
     {
         for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
             cg::sync(cta);
             uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
-                       s_val[pos + stride], sortDir);
+            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], sortDir);
         }
     }

     cg::sync(cta);
     d_DstKey[0] = s_key[threadIdx.x + 0];
     d_DstVal[0] = s_val[threadIdx.x + 0];
-    d_DstKey[(SHARED_SIZE_LIMIT / 2)] =
-        s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
-    d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
-        s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
 }

 // Helper function (also used by odd-even merge sort)
-extern "C" uint factorRadix2(uint *log2L, uint L) {
+extern "C" uint factorRadix2(uint *log2L, uint L)
+{
     if (!L) {
         *log2L = 0;
         return 0;
-    } else {
+    }
+    else {
         for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
             ;

         return L;
     }
 }

-extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
-                                  uint *d_SrcKey, uint *d_SrcVal,
-                                  uint batchSize, uint arrayLength,
-                                  uint sortDir) {
+extern "C" void bitonicSortShared(uint *d_DstKey,
+                                  uint *d_DstVal,
+                                  uint *d_SrcKey,
+                                  uint *d_SrcVal,
+                                  uint batchSize,
+                                  uint arrayLength,
+                                  uint sortDir)
+{
     // Nothing to sort
     if (arrayLength < 2) {
         return;
     }

     // Only power-of-two array lengths are supported by this implementation
     uint log2L;
     uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
     assert(factorizationRemainder == 1);

     uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
     uint threadCount = SHARED_SIZE_LIMIT / 2;

     assert(arrayLength <= SHARED_SIZE_LIMIT);
     assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);

-    bitonicSortSharedKernel<<<blockCount, threadCount>>>(
-        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
+    bitonicSortSharedKernel<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
     getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Merge step 3: merge elementary intervals
 ////////////////////////////////////////////////////////////////////////////////
-static inline __host__ __device__ uint iDivUp(uint a, uint b) {
-    return ((a % b) == 0) ? (a / b) : (a / b + 1);
-}
+static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }

-static inline __host__ __device__ uint getSampleCount(uint dividend) {
-    return iDivUp(dividend, SAMPLE_STRIDE);
-}
+static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }

 template <uint sortDir>
-static inline __device__ void ComparatorExtended(uint &keyA, uint &valA,
-                                                 uint &flagA, uint &keyB,
-                                                 uint &valB, uint &flagB,
-                                                 uint arrowDir) {
+static inline __device__ void
+ComparatorExtended(uint &keyA, uint &valA, uint &flagA, uint &keyB, uint &valB, uint &flagB, uint arrowDir)
+{
     uint t;

-    if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) ||
-        ((arrowDir == sortDir) && (flagA == 1)) ||
-        ((arrowDir != sortDir) && (flagB == 1))) {
+    if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || ((arrowDir == sortDir) && (flagA == 1))
+        || ((arrowDir != sortDir) && (flagB == 1))) {
         t = keyA;
         keyA = keyB;
         keyB = t;
         t = valA;
         valA = valB;
         valB = t;
         t = flagA;
         flagA = flagB;
         flagB = t;
     }
 }

 template <uint sortDir>
-__global__ void bitonicMergeElementaryIntervalsKernel(
-    uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal,
-    uint *d_LimitsA, uint *d_LimitsB, uint stride, uint N) {
+__global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
+                                                      uint *d_DstVal,
+                                                      uint *d_SrcKey,
+                                                      uint *d_SrcVal,
+                                                      uint *d_LimitsA,
+                                                      uint *d_LimitsB,
+                                                      uint stride,
+                                                      uint N)
+{
     // Handle to thread block group
     cg::thread_block cta = cg::this_thread_block();
     __shared__ uint s_key[2 * SAMPLE_STRIDE];
     __shared__ uint s_val[2 * SAMPLE_STRIDE];
     __shared__ uint s_inf[2 * SAMPLE_STRIDE];

     const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
     const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
     d_SrcKey += segmentBase;
     d_SrcVal += segmentBase;
     d_DstKey += segmentBase;
     d_DstVal += segmentBase;

     // Set up threadblock-wide parameters
     __shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst;

     if (threadIdx.x == 0) {
         uint segmentElementsA = stride;
         uint segmentElementsB = umin(stride, N - segmentBase - stride);
         uint segmentSamplesA = stride / SAMPLE_STRIDE;
         uint segmentSamplesB = getSampleCount(segmentElementsB);
         uint segmentSamples = segmentSamplesA + segmentSamplesB;

         startSrcA = d_LimitsA[blockIdx.x];
         startSrcB = d_LimitsB[blockIdx.x];
         startDst = startSrcA + startSrcB;

-        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1]
-                                                        : segmentElementsA;
-        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
-                                                        : segmentElementsB;
+        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
+        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
         lenSrcA = endSrcA - startSrcA;
         lenSrcB = endSrcB - startSrcB;
     }

     s_inf[threadIdx.x + 0] = 1;
     s_inf[threadIdx.x + SAMPLE_STRIDE] = 1;

     // Load input data
     cg::sync(cta);

     if (threadIdx.x < lenSrcA) {
         s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x];
         s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x];
         s_inf[threadIdx.x] = 0;
     }

     // Prepare for bitonic merge by inversing the ordering
     if (threadIdx.x < lenSrcB) {
-        s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
-            d_SrcKey[stride + startSrcB + threadIdx.x];
-        s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
-            d_SrcVal[stride + startSrcB + threadIdx.x];
+        s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcKey[stride + startSrcB + threadIdx.x];
+        s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcVal[stride + startSrcB + threadIdx.x];
         s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
     }

     //"Extended" bitonic merge
     for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
         cg::sync(cta);
         uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-        ComparatorExtended<sortDir>(s_key[pos + 0], s_val[pos + 0], s_inf[pos + 0],
-                                    s_key[pos + stride], s_val[pos + stride],
-                                    s_inf[pos + stride], sortDir);
+        ComparatorExtended<sortDir>(s_key[pos + 0],
+                                    s_val[pos + 0],
+                                    s_inf[pos + 0],
+                                    s_key[pos + stride],
+                                    s_val[pos + stride],
+                                    s_inf[pos + stride],
+                                    sortDir);
     }

     // Store sorted data
     cg::sync(cta);
     d_DstKey += startDst;
     d_DstVal += startDst;

     if (threadIdx.x < lenSrcA) {
         d_DstKey[threadIdx.x] = s_key[threadIdx.x];
         d_DstVal[threadIdx.x] = s_val[threadIdx.x];
     }

     if (threadIdx.x < lenSrcB) {
         d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
         d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
     }
 }

-extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
-                                                uint *d_SrcKey, uint *d_SrcVal,
-                                                uint *d_LimitsA,
-                                                uint *d_LimitsB, uint stride,
-                                                uint N, uint sortDir) {
+extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
+                                                uint *d_DstVal,
+                                                uint *d_SrcKey,
+                                                uint *d_SrcVal,
+                                                uint *d_LimitsA,
+                                                uint *d_LimitsB,
+                                                uint stride,
+                                                uint N,
+                                                uint sortDir)
+{
     uint lastSegmentElements = N % (2 * stride);

-    uint mergePairs = (lastSegmentElements > stride)
-                          ? getSampleCount(N)
-                          : (N - lastSegmentElements) / SAMPLE_STRIDE;
+    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;

     if (sortDir) {
-        bitonicMergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
-            d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
-            N);
+        bitonicMergeElementaryIntervalsKernel<1U>
+            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
         getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
-    } else {
-        bitonicMergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
-            d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
-            N);
+    }
+    else {
+        bitonicMergeElementaryIntervalsKernel<0U>
+            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
         getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
     }
 }

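bitonicSortSharedKernel sorts each SHARED_SIZE_LIMIT-element sub-batch entirely in shared memory: a bitonic network over n = 2^k keys runs k(k + 1)/2 comparator stages of n/2 compare-exchanges each (Theta(n log^2 n) work), which is why bitonicSortShared asserts a power-of-two arrayLength. A hedged usage sketch, assuming SHARED_SIZE_LIMIT is 1024 (its value in this sample) and that the d_* buffers were allocated as in the test driver:

// Sketch: sort 4M key/value pairs as 4096 independent 1024-element batches.
// arrayLength must be a power of two <= SHARED_SIZE_LIMIT, and
// batchSize * arrayLength a multiple of SHARED_SIZE_LIMIT, per the asserts.
uint batchSize = 4096, arrayLength = 1024, ascending = 1;
bitonicSortShared(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, batchSize, arrayLength, ascending);
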
|
@ -26,96 +26,94 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <helper_cuda.h>
|
||||||
|
#include <helper_functions.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <cuda_runtime.h>
|
|
||||||
#include <helper_functions.h>
|
|
||||||
#include <helper_cuda.h>
|
|
||||||
#include "mergeSort_common.h"
|
#include "mergeSort_common.h"
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Test driver
|
// Test driver
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv)
|
||||||
uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
|
{
|
||||||
uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
|
uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
|
||||||
StopWatchInterface *hTimer = NULL;
|
uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
|
||||||
|
StopWatchInterface *hTimer = NULL;
|
||||||
|
|
||||||
const uint N = 4 * 1048576;
|
const uint N = 4 * 1048576;
|
||||||
const uint DIR = 1;
|
const uint DIR = 1;
|
||||||
const uint numValues = 65536;
|
const uint numValues = 65536;
|
||||||
|
|
||||||
printf("%s Starting...\n\n", argv[0]);
|
printf("%s Starting...\n\n", argv[0]);
|
||||||
|
|
||||||
int dev = findCudaDevice(argc, (const char **)argv);
|
int dev = findCudaDevice(argc, (const char **)argv);
|
||||||
|
|
||||||
if (dev == -1) {
|
if (dev == -1) {
|
||||||
return EXIT_FAILURE;
|
return EXIT_FAILURE;
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("Allocating and initializing host arrays...\n\n");
|
printf("Allocating and initializing host arrays...\n\n");
|
||||||
sdkCreateTimer(&hTimer);
|
sdkCreateTimer(&hTimer);
|
||||||
h_SrcKey = (uint *)malloc(N * sizeof(uint));
|
h_SrcKey = (uint *)malloc(N * sizeof(uint));
|
||||||
h_SrcVal = (uint *)malloc(N * sizeof(uint));
|
h_SrcVal = (uint *)malloc(N * sizeof(uint));
|
||||||
h_DstKey = (uint *)malloc(N * sizeof(uint));
|
h_DstKey = (uint *)malloc(N * sizeof(uint));
|
||||||
h_DstVal = (uint *)malloc(N * sizeof(uint));
|
h_DstVal = (uint *)malloc(N * sizeof(uint));
|
||||||
|
|
||||||
srand(2009);
|
srand(2009);
|
||||||
|
|
||||||
for (uint i = 0; i < N; i++) {
|
for (uint i = 0; i < N; i++) {
|
||||||
h_SrcKey[i] = rand() % numValues;
|
h_SrcKey[i] = rand() % numValues;
|
||||||
}
|
}
|
||||||
|
|
||||||
fillValues(h_SrcVal, N);
|
fillValues(h_SrcVal, N);
|
||||||
|
|
||||||
printf("Allocating and initializing CUDA arrays...\n\n");
|
printf("Allocating and initializing CUDA arrays...\n\n");
|
||||||
checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
|
checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
|
||||||
checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
|
checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
|
||||||
checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
|
checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
|
||||||
    checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
    checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));

    printf("Initializing GPU merge sort...\n");
    initMergeSort();

    printf("Running GPU merge sort...\n");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));

    printf("Reading back GPU merge sort results...\n");
    checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));

    printf("Inspecting the results...\n");
    uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);
    uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N);

    printf("Shutting down...\n");
    closeMergeSort();
    sdkDeleteTimer(&hTimer);
    checkCudaErrors(cudaFree(d_SrcVal));
    checkCudaErrors(cudaFree(d_SrcKey));
    checkCudaErrors(cudaFree(d_BufVal));
    checkCudaErrors(cudaFree(d_BufKey));
    checkCudaErrors(cudaFree(d_DstVal));
    checkCudaErrors(cudaFree(d_DstKey));
    free(h_DstVal);
    free(h_DstKey);
    free(h_SrcVal);
    free(h_SrcKey);

    exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
}
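// The timing pattern above (synchronize, reset, start, run, synchronize,
// stop) uses the SDK's sdkTimer helpers. A minimal sketch of the same
// measurement using plain CUDA events instead, assuming only the runtime
// API (illustrative only, not part of the sample):
//
//   cudaEvent_t start, stop;
//   checkCudaErrors(cudaEventCreate(&start));
//   checkCudaErrors(cudaEventCreate(&stop));
//   checkCudaErrors(cudaEventRecord(start, 0));
//   mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
//   checkCudaErrors(cudaEventRecord(stop, 0));
//   checkCudaErrors(cudaEventSynchronize(stop));
//   float ms = 0.0f;
//   checkCudaErrors(cudaEventElapsedTime(&ms, start, stop));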
@ -39,491 +39,499 @@
namespace cg = cooperative_groups;

#include <helper_cuda.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }

static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }

#define W (sizeof(uint) * 8)
static inline __device__ uint nextPowerOfTwo(uint x)
{
    /*
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
    */
    return 1U << (W - __clz(x - 1));
}
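// nextPowerOfTwo(x) rounds x up to the nearest power of two. The retired
// bit-smearing version (kept in the comment above) and the __clz() form are
// equivalent: with W = 32, 1U << (32 - __clz(x - 1)) is the smallest power
// of two >= x (e.g. 5 -> 8, 8 -> 8; __clz(0) == 32 covers x == 1). A host
// sketch of the same identity, assuming a GCC/Clang-style __builtin_clz
// (illustrative only, not part of the sample):
static inline unsigned int hostNextPowerOfTwo(unsigned int x)
{
    // Valid for 2 <= x <= 2^31; __builtin_clz(0) is undefined on the host,
    // unlike the device intrinsic __clz(0), which returns 32.
    return 1U << (32 - __builtin_clz(x - 1));
}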
template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }

    uint pos = 0;

    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }

    return pos;
}

template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }

    uint pos = 0;

    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }

    return pos;
}
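// How the two search flavors cooperate: with equal keys, the exclusive
// search (strict comparison) ranks A-elements before their equals in B,
// while the inclusive search ranks B-elements after their equals in A, so
// the computed destination slots are disjoint and the merge is stable.
// Worked example (sortDir = 1, ascending), hypothetical inputs:
//   A = {1, 3, 3, 7}            B = {3, 3, 5, 9}
//   exclusive ranks of A in B:  {0, 0, 0, 3}  (elements of B strictly less)
//   inclusive ranks of B in A:  {3, 3, 3, 4}  (elements of A less-or-equal)
//   dstPos(A[i]) = rank + i  =  {0, 1, 2, 6}
//   dstPos(B[j]) = rank + j  =  {3, 4, 5, 7}  -> together cover 0..7 exactly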
////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[SHARED_SIZE_LIMIT];
    __shared__ uint s_val[SHARED_SIZE_LIMIT];

    d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    s_key[threadIdx.x + 0] = d_SrcKey[0];
    s_val[threadIdx.x + 0] = d_SrcVal[0];
    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];

    for (uint stride = 1; stride < arrayLength; stride <<= 1) {
        uint lPos = threadIdx.x & (stride - 1);
        uint *baseKey = s_key + 2 * (threadIdx.x - lPos);
        uint *baseVal = s_val + 2 * (threadIdx.x - lPos);

        cg::sync(cta);
        uint keyA = baseKey[lPos + 0];
        uint valA = baseVal[lPos + 0];
        uint keyB = baseKey[lPos + stride];
        uint valB = baseVal[lPos + stride];
        uint posA = binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + lPos;
        uint posB = binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + lPos;

        cg::sync(cta);
        baseKey[posA] = keyA;
        baseVal[posA] = valA;
        baseKey[posB] = keyB;
        baseVal[posB] = valB;
    }

    cg::sync(cta);
    d_DstKey[0] = s_key[threadIdx.x + 0];
    d_DstVal[0] = s_val[threadIdx.x + 0];
    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
static void mergeSortShared(uint *d_DstKey,
                            uint *d_DstVal,
                            uint *d_SrcKey,
                            uint *d_SrcVal,
                            uint batchSize,
                            uint arrayLength,
                            uint sortDir)
{
    if (arrayLength < 2) {
        return;
    }

    assert(SHARED_SIZE_LIMIT % arrayLength == 0);
    assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0);
    uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
    uint threadCount = SHARED_SIZE_LIMIT / 2;

    if (sortDir) {
        mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<1><<<>>> failed\n");
    }
    else {
        mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<0><<<>>> failed\n");
    }
}
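// Launch geometry, worked through for the default SHARED_SIZE_LIMIT = 1024:
// each block stages 1024 key/value pairs in shared memory and runs
// threadCount = 1024 / 2 = 512 threads, each thread loading, positioning and
// storing two elements. For a batch of N = 1048576 elements sorted in
// 1024-element chunks, blockCount = 1048576 / 1024 = 1024 blocks.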
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void
generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (pos >= threadCount) {
        return;
    }

    const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    d_SrcKey += segmentBase;
    d_RanksA += segmentBase / SAMPLE_STRIDE;
    d_RanksB += segmentBase / SAMPLE_STRIDE;

    const uint segmentElementsA = stride;
    const uint segmentElementsB = umin(stride, N - segmentBase - stride);
    const uint segmentSamplesA = getSampleCount(segmentElementsA);
    const uint segmentSamplesB = getSampleCount(segmentElementsB);

    if (i < segmentSamplesA) {
        d_RanksA[i] = i * SAMPLE_STRIDE;
        d_RanksB[i] = binarySearchExclusive<sortDir>(
            d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, nextPowerOfTwo(segmentElementsB));
    }

    if (i < segmentSamplesB) {
        d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
        d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
            d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, nextPowerOfTwo(segmentElementsA));
    }
}

static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    if (sortDir) {
        generateSampleRanksKernel<1U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
    }
    else {
        generateSampleRanksKernel<0U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
    }
}
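// Thread accounting for the sampling pass, with SAMPLE_STRIDE = 128: each
// pair of stride-long segments (2 * stride elements) gets one thread per
// SAMPLE_STRIDE elements. E.g. N = 4096 and stride = 1024 means two full
// segment pairs and no ragged tail (lastSegmentElements = 0), so
//   threadCount = (4096 - 0) / (2 * 128) = 16
// i.e. 8 threads per segment pair, one for each of the 8 samples
// (stride / SAMPLE_STRIDE) taken from A and from B.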
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (pos >= threadCount) {
        return;
    }

    const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    d_Ranks += (pos - i) * 2;
    d_Limits += (pos - i) * 2;

    const uint segmentElementsA = stride;
    const uint segmentElementsB = umin(stride, N - segmentBase - stride);
    const uint segmentSamplesA = getSampleCount(segmentElementsA);
    const uint segmentSamplesB = getSampleCount(segmentElementsB);

    if (i < segmentSamplesA) {
        uint dstPos = binarySearchExclusive<1U>(
                          d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB))
                      + i;
        d_Limits[dstPos] = d_Ranks[i];
    }

    if (i < segmentSamplesB) {
        uint dstPos = binarySearchInclusive<1U>(
                          d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA))
                      + i;
        d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
    }
}

static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, uint *d_RanksA, uint *d_RanksB, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsA, d_RanksA, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");

    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsB, d_RanksB, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
}
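// Worked example of what steps 1-2 produce (hypothetical numbers,
// SAMPLE_STRIDE = 4, stride = 8):
//   A = {1,2,3,4,9,10,11,12}, B = {5,6,7,8,13,14,15,16}
//   sample ranks         -> RanksA = {0,4, 4,8}, RanksB = {0,4, 0,4}
//   merged into limits   -> LimitsA = {0,4,4,8}, LimitsB = {0,0,4,4}
//   elementary intervals -> (A[0,4), B[0,0)) -> dst[0,4)
//                           (A[4,4), B[0,4)) -> dst[4,8)
//                           (A[4,8), B[4,4)) -> dst[8,12)
//                           (A[8,8), B[4,8)) -> dst[12,16)
// Every interval holds at most SAMPLE_STRIDE elements from each input, so a
// single SAMPLE_STRIDE-thread block can merge it in shared memory (step 3).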
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
inline __device__ void merge(uint *dstKey,
                             uint *dstVal,
                             uint *srcAKey,
                             uint *srcAVal,
                             uint *srcBKey,
                             uint *srcBVal,
                             uint lenA,
                             uint nPowTwoLenA,
                             uint lenB,
                             uint nPowTwoLenB,
                             cg::thread_block cta)
{
    uint keyA, valA, keyB, valB, dstPosA, dstPosB;

    if (threadIdx.x < lenA) {
        keyA = srcAKey[threadIdx.x];
        valA = srcAVal[threadIdx.x];
        dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + threadIdx.x;
    }

    if (threadIdx.x < lenB) {
        keyB = srcBKey[threadIdx.x];
        valB = srcBVal[threadIdx.x];
        dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + threadIdx.x;
    }

    cg::sync(cta);

    if (threadIdx.x < lenA) {
        dstKey[dstPosA] = keyA;
        dstVal[dstPosA] = valA;
    }

    if (threadIdx.x < lenB) {
        dstKey[dstPosB] = keyB;
        dstVal[dstPosB] = valB;
    }
}
template <uint sortDir>
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,
                                               uint *d_DstVal,
                                               uint *d_SrcKey,
                                               uint *d_SrcVal,
                                               uint *d_LimitsA,
                                               uint *d_LimitsB,
                                               uint stride,
                                               uint N)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[2 * SAMPLE_STRIDE];
    __shared__ uint s_val[2 * SAMPLE_STRIDE];

    const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
    const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
    d_SrcKey += segmentBase;
    d_SrcVal += segmentBase;
    d_DstKey += segmentBase;
    d_DstVal += segmentBase;

    // Set up threadblock-wide parameters
    __shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;

    if (threadIdx.x == 0) {
        uint segmentElementsA = stride;
        uint segmentElementsB = umin(stride, N - segmentBase - stride);
        uint segmentSamplesA = getSampleCount(segmentElementsA);
        uint segmentSamplesB = getSampleCount(segmentElementsB);
        uint segmentSamples = segmentSamplesA + segmentSamplesB;

        startSrcA = d_LimitsA[blockIdx.x];
        startSrcB = d_LimitsB[blockIdx.x];
        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
        lenSrcA = endSrcA - startSrcA;
        lenSrcB = endSrcB - startSrcB;
        startDstA = startSrcA + startSrcB;
        startDstB = startDstA + lenSrcA;
    }

    // Load main input data
    cg::sync(cta);

    if (threadIdx.x < lenSrcA) {
        s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x];
        s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x];
    }

    if (threadIdx.x < lenSrcB) {
        s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x];
        s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x];
    }

    // Merge data in shared memory
    cg::sync(cta);
    merge<sortDir>(s_key,
                   s_val,
                   s_key + 0,
                   s_val + 0,
                   s_key + SAMPLE_STRIDE,
                   s_val + SAMPLE_STRIDE,
                   lenSrcA,
                   SAMPLE_STRIDE,
                   lenSrcB,
                   SAMPLE_STRIDE,
                   cta);

    // Store merged data
    cg::sync(cta);

    if (threadIdx.x < lenSrcA) {
        d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x];
        d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x];
    }

    if (threadIdx.x < lenSrcB) {
        d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
        d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
    }
}
static void mergeElementaryIntervals(uint *d_DstKey,
                                     uint *d_DstVal,
                                     uint *d_SrcKey,
                                     uint *d_SrcVal,
                                     uint *d_LimitsA,
                                     uint *d_LimitsB,
                                     uint stride,
                                     uint N,
                                     uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;

    if (sortDir) {
        mergeElementaryIntervalsKernel<1U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
    }
    else {
        mergeElementaryIntervalsKernel<0U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
    }
}

extern "C" void bitonicSortShared(uint *d_DstKey,
                                  uint *d_DstVal,
                                  uint *d_SrcKey,
                                  uint *d_SrcVal,
                                  uint batchSize,
                                  uint arrayLength,
                                  uint sortDir);

extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
                                                uint *d_DstVal,
                                                uint *d_SrcKey,
                                                uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB,
                                                uint stride,
                                                uint N,
                                                uint sortDir);

static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
static const uint MAX_SAMPLE_COUNT = 32768;

extern "C" void initMergeSort(void)
{
    checkCudaErrors(cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
}

extern "C" void closeMergeSort(void)
{
    checkCudaErrors(cudaFree(d_RanksA));
    checkCudaErrors(cudaFree(d_RanksB));
    checkCudaErrors(cudaFree(d_LimitsB));
    checkCudaErrors(cudaFree(d_LimitsA));
}
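// Sizing note: every rank/limit entry covers SAMPLE_STRIDE source elements,
// so these four MAX_SAMPLE_COUNT-entry buffers support arrays of up to
// SAMPLE_STRIDE * MAX_SAMPLE_COUNT = 128 * 32768 = 4194304 keys; mergeSort()
// below asserts exactly this bound before launching any kernels.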
extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,
|
|
||||||
uint *d_BufVal, uint *d_SrcKey, uint *d_SrcVal,
|
|
||||||
uint N, uint sortDir) {
|
|
||||||
uint stageCount = 0;
|
|
||||||
|
|
||||||
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
|
|
||||||
;
|
|
||||||
|
|
||||||
uint *ikey, *ival, *okey, *oval;
|
|
||||||
|
|
||||||
if (stageCount & 1) {
|
|
||||||
ikey = d_BufKey;
|
|
||||||
ival = d_BufVal;
|
|
||||||
okey = d_DstKey;
|
|
||||||
oval = d_DstVal;
|
|
||||||
} else {
|
|
||||||
ikey = d_DstKey;
|
|
||||||
ival = d_DstVal;
|
|
||||||
okey = d_BufKey;
|
|
||||||
oval = d_BufVal;
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
|
|
||||||
assert(N % SHARED_SIZE_LIMIT == 0);
|
|
||||||
mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT,
|
|
||||||
SHARED_SIZE_LIMIT, sortDir);
|
|
||||||
|
|
||||||
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
|
|
||||||
uint lastSegmentElements = N % (2 * stride);
|
|
||||||
|
|
||||||
// Find sample ranks and prepare for limiters merge
|
|
||||||
generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir);
|
|
||||||
|
|
||||||
// Merge ranks and indices
|
|
||||||
mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);
|
|
||||||
|
|
||||||
// Merge elementary intervals
|
|
||||||
mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB,
|
|
||||||
stride, N, sortDir);
|
|
||||||
|
|
||||||
if (lastSegmentElements <= stride) {
|
|
||||||
// Last merge segment consists of a single array which just needs to be
|
|
||||||
// passed through
|
|
||||||
checkCudaErrors(cudaMemcpy(
|
|
||||||
okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
|
|
||||||
lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice));
|
|
||||||
checkCudaErrors(cudaMemcpy(
|
|
||||||
oval + (N - lastSegmentElements), ival + (N - lastSegmentElements),
|
|
||||||
lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
uint *t;
|
// Load main input data
|
||||||
t = ikey;
|
cg::sync(cta);
|
||||||
ikey = okey;
|
|
||||||
okey = t;
|
if (threadIdx.x < lenSrcA) {
|
||||||
t = ival;
|
s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x];
|
||||||
ival = oval;
|
s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x];
|
||||||
oval = t;
|
}
|
||||||
}
|
|
||||||
|
if (threadIdx.x < lenSrcB) {
|
||||||
|
s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x];
|
||||||
|
s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Merge data in shared memory
|
||||||
|
cg::sync(cta);
|
||||||
|
merge<sortDir>(s_key,
|
||||||
|
s_val,
|
||||||
|
s_key + 0,
|
||||||
|
s_val + 0,
|
||||||
|
s_key + SAMPLE_STRIDE,
|
||||||
|
s_val + SAMPLE_STRIDE,
|
||||||
|
lenSrcA,
|
||||||
|
SAMPLE_STRIDE,
|
||||||
|
lenSrcB,
|
||||||
|
SAMPLE_STRIDE,
|
||||||
|
cta);
|
||||||
|
|
||||||
|
// Store merged data
|
||||||
|
cg::sync(cta);
|
||||||
|
|
||||||
|
if (threadIdx.x < lenSrcA) {
|
||||||
|
d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x];
|
||||||
|
d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x];
|
||||||
|
}
|
||||||
|
|
||||||
|
if (threadIdx.x < lenSrcB) {
|
||||||
|
d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
|
||||||
|
d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mergeElementaryIntervals(uint *d_DstKey,
|
||||||
|
uint *d_DstVal,
|
||||||
|
uint *d_SrcKey,
|
||||||
|
uint *d_SrcVal,
|
||||||
|
uint *d_LimitsA,
|
||||||
|
uint *d_LimitsB,
|
||||||
|
uint stride,
|
||||||
|
uint N,
|
||||||
|
uint sortDir)
|
||||||
|
{
|
||||||
|
uint lastSegmentElements = N % (2 * stride);
|
||||||
|
uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
|
||||||
|
|
||||||
|
if (sortDir) {
|
||||||
|
mergeElementaryIntervalsKernel<1U>
|
||||||
|
<<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
|
||||||
|
getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
mergeElementaryIntervalsKernel<0U>
|
||||||
|
<<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
|
||||||
|
getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" void bitonicSortShared(uint *d_DstKey,
|
||||||
|
uint *d_DstVal,
|
||||||
|
uint *d_SrcKey,
|
||||||
|
uint *d_SrcVal,
|
||||||
|
uint batchSize,
|
||||||
|
uint arrayLength,
|
||||||
|
uint sortDir);
|
||||||
|
|
||||||
|
extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
|
||||||
|
uint *d_DstVal,
|
||||||
|
uint *d_SrcKey,
|
||||||
|
uint *d_SrcVal,
|
||||||
|
uint *d_LimitsA,
|
||||||
|
uint *d_LimitsB,
|
||||||
|
uint stride,
|
||||||
|
uint N,
|
||||||
|
uint sortDir);
|
||||||
|
|
||||||
|
static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
|
||||||
|
static const uint MAX_SAMPLE_COUNT = 32768;
|
||||||
|
|
||||||
|
extern "C" void initMergeSort(void)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
|
||||||
|
checkCudaErrors(cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
|
||||||
|
checkCudaErrors(cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
|
||||||
|
checkCudaErrors(cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" void closeMergeSort(void)
|
||||||
|
{
|
||||||
|
checkCudaErrors(cudaFree(d_RanksA));
|
||||||
|
checkCudaErrors(cudaFree(d_RanksB));
|
||||||
|
checkCudaErrors(cudaFree(d_LimitsB));
|
||||||
|
checkCudaErrors(cudaFree(d_LimitsA));
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" void mergeSort(uint *d_DstKey,
|
||||||
|
uint *d_DstVal,
|
||||||
|
uint *d_BufKey,
|
||||||
|
uint *d_BufVal,
|
||||||
|
uint *d_SrcKey,
|
||||||
|
uint *d_SrcVal,
|
||||||
|
uint N,
|
||||||
|
uint sortDir)
|
||||||
|
{
|
||||||
|
uint stageCount = 0;
|
||||||
|
|
||||||
|
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
|
||||||
|
;
|
||||||
|
|
||||||
|
uint *ikey, *ival, *okey, *oval;
|
||||||
|
|
||||||
|
if (stageCount & 1) {
|
||||||
|
ikey = d_BufKey;
|
||||||
|
ival = d_BufVal;
|
||||||
|
okey = d_DstKey;
|
||||||
|
oval = d_DstVal;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
ikey = d_DstKey;
|
||||||
|
ival = d_DstVal;
|
||||||
|
okey = d_BufKey;
|
||||||
|
oval = d_BufVal;
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
|
||||||
|
assert(N % SHARED_SIZE_LIMIT == 0);
|
||||||
|
mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, SHARED_SIZE_LIMIT, sortDir);
|
||||||
|
|
||||||
|
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
|
||||||
|
uint lastSegmentElements = N % (2 * stride);
|
||||||
|
|
||||||
|
// Find sample ranks and prepare for limiters merge
|
||||||
|
generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir);
|
||||||
|
|
||||||
|
// Merge ranks and indices
|
||||||
|
mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);
|
||||||
|
|
||||||
|
// Merge elementary intervals
|
||||||
|
mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, stride, N, sortDir);
|
||||||
|
|
||||||
|
if (lastSegmentElements <= stride) {
|
||||||
|
// Last merge segment consists of a single array which just needs to be
|
||||||
|
// passed through
|
||||||
|
checkCudaErrors(cudaMemcpy(okey + (N - lastSegmentElements),
|
||||||
|
ikey + (N - lastSegmentElements),
|
||||||
|
lastSegmentElements * sizeof(uint),
|
||||||
|
cudaMemcpyDeviceToDevice));
|
||||||
|
checkCudaErrors(cudaMemcpy(oval + (N - lastSegmentElements),
|
||||||
|
ival + (N - lastSegmentElements),
|
||||||
|
lastSegmentElements * sizeof(uint),
|
||||||
|
cudaMemcpyDeviceToDevice));
|
||||||
|
}
|
||||||
|
|
||||||
|
uint *t;
|
||||||
|
t = ikey;
|
||||||
|
ikey = okey;
|
||||||
|
okey = t;
|
||||||
|
t = ival;
|
||||||
|
ival = oval;
|
||||||
|
oval = t;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
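// Ping-pong rationale in mergeSort() above: every pass of the merge loop
// consumes (ikey, ival) and produces (okey, oval), then swaps the pairs, so
// the initial assignment is chosen from the pass count to make the final
// output land in d_DstKey/d_DstVal. Worked example: N = 4 * SHARED_SIZE_LIMIT
// yields strides {1024, 2048}, hence stageCount = 2 (even); the bottom-level
// sort then writes into d_DstKey, and after two buffer swaps the fully
// merged result ends up back in d_DstKey as required.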
@ -31,19 +31,17 @@
typedef unsigned int uint;

#define SHARED_SIZE_LIMIT 1024U
#define SAMPLE_STRIDE 128

////////////////////////////////////////////////////////////////////////////////
// Extensive sort validation routine
////////////////////////////////////////////////////////////////////////////////
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir);

extern "C" void fillValues(uint *val, uint N);

extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength);

////////////////////////////////////////////////////////////////////////////////
// CUDA merge sort
@ -52,13 +50,11 @@ extern "C" void initMergeSort(void);
extern "C" void closeMergeSort(void);
|
extern "C" void closeMergeSort(void);
|
||||||
|
|
||||||
extern "C" void mergeSort(uint *dstKey, uint *dstVal, uint *bufKey,
|
extern "C" void
|
||||||
uint *bufVal, uint *srcKey, uint *srcVal, uint N,
|
mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);
|
||||||
uint sortDir);
|
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// CPU "emulation"
|
// CPU "emulation"
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
|
extern "C" void
|
||||||
uint *bufVal, uint *srcKey, uint *srcVal, uint N,
|
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);
|
||||||
uint sortDir);
|
|
||||||
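// A minimal end-to-end usage sketch of this interface (host side), assuming
// N is a multiple of SHARED_SIZE_LIMIT and within the CUDA path's asserted
// capacity (illustrative only; mirrors the sample's main()):
//
//   uint *d_DstKey, *d_DstVal, *d_BufKey, *d_BufVal, *d_SrcKey, *d_SrcVal;
//   checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
//   // ... likewise for the other five device buffers ...
//   checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
//   checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));
//   initMergeSort();                   // allocates rank/limit scratch buffers
//   mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal,
//             d_SrcKey, d_SrcVal, N, 1);  // 1 = ascending
//   closeMergeSort();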
@ -29,329 +29,335 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static void checkOrder(uint *data, uint N, uint sortDir)
{
    if (N <= 1) {
        return;
    }

    for (uint i = 0; i < N - 1; i++)
        if ((sortDir && (data[i] > data[i + 1])) || (!sortDir && (data[i] < data[i + 1]))) {
            fprintf(stderr, "checkOrder() failed!!!\n");
            exit(EXIT_FAILURE);
        }
}
static uint umin(uint a, uint b) { return (a <= b) ? a : b; }

static uint getSampleCount(uint dividend)
{
    return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1) : (dividend / SAMPLE_STRIDE);
}

static uint nextPowerOfTwo(uint x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}
static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }

    uint pos = 0;

    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }

    return pos;
}

static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }

    uint pos = 0;

    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }

    return pos;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: find sample ranks in each segment
////////////////////////////////////////////////////////////////////////////////
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    for (uint pos = 0; pos < sampleCount; pos++) {
        const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
        const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);

        const uint lenA = stride;
        const uint lenB = umin(stride, N - segmentBase - stride);
        const uint nA = stride / SAMPLE_STRIDE;
        const uint nB = getSampleCount(lenB);

        if (i < nA) {
            ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
            ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = binarySearchExclusive(
                srcKey[segmentBase + i * SAMPLE_STRIDE], srcKey + segmentBase + stride, lenB, sortDir);
        }

        if (i < nB) {
            ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
            ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = binarySearchInclusive(
                srcKey[segmentBase + stride + i * SAMPLE_STRIDE], srcKey + segmentBase, lenA, sortDir);
        }
    }
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: merge ranks and indices to derive elementary intervals
////////////////////////////////////////////////////////////////////////////////
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    for (uint pos = 0; pos < sampleCount; pos++) {
        const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
        const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);

        const uint lenA = stride;
        const uint lenB = umin(stride, N - segmentBase - stride);
        const uint nA = stride / SAMPLE_STRIDE;
        const uint nB = getSampleCount(lenB);

        if (i < nA) {
            uint dstPosA =
                binarySearchExclusive(
                    ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], ranks + (segmentBase + stride) / SAMPLE_STRIDE, nB, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
        }

        if (i < nB) {
            uint dstPosA =
                binarySearchInclusive(
                    ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
        }
    }
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals (each interval is <= SAMPLE_STRIDE)
////////////////////////////////////////////////////////////////////////////////
static void merge(uint *dstKey,
                  uint *dstVal,
                  uint *srcAKey,
                  uint *srcAVal,
                  uint *srcBKey,
                  uint *srcBVal,
                  uint lenA,
                  uint lenB,
                  uint sortDir)
{
    checkOrder(srcAKey, lenA, sortDir);
    checkOrder(srcBKey, lenB, sortDir);

    for (uint i = 0; i < lenA; i++) {
        uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i;
        assert(dstPos < lenA + lenB);
        dstKey[dstPos] = srcAKey[i];
        dstVal[dstPos] = srcAVal[i];
    }

    for (uint i = 0; i < lenB; i++) {
        uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i;
        assert(dstPos < lenA + lenB);
        dstKey[dstPos] = srcBKey[i];
        dstVal[dstPos] = srcBVal[i];
    }
}
static void mergeElementaryIntervals(uint *dstKey,
                                     uint *dstVal,
                                     uint *srcKey,
                                     uint *srcVal,
                                     uint *limitsA,
                                     uint *limitsB,
                                     uint stride,
                                     uint N,
                                     uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;

    for (uint pos = 0; pos < mergePairs; pos++) {
        uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
        uint segmentBase = (pos - i) * SAMPLE_STRIDE;

        const uint lenA = stride;
        const uint lenB = umin(stride, N - segmentBase - stride);
        const uint nA = stride / SAMPLE_STRIDE;
        const uint nB = getSampleCount(lenB);
        const uint n = nA + nB;

        const uint startPosA = limitsA[pos];
        const uint endPosA = (i + 1 < n) ? limitsA[pos + 1] : lenA;
        const uint startPosB = limitsB[pos];
        const uint endPosB = (i + 1 < n) ? limitsB[pos + 1] : lenB;
        const uint startPosDst = startPosA + startPosB;

        assert(startPosA <= endPosA && endPosA <= lenA);
        assert(startPosB <= endPosB && endPosB <= lenB);
        assert((endPosA - startPosA) <= SAMPLE_STRIDE);
        assert((endPosB - startPosB) <= SAMPLE_STRIDE);

        merge(dstKey + segmentBase + startPosDst,
              dstVal + segmentBase + startPosDst,
              (srcKey + segmentBase + 0) + startPosA,
              (srcVal + segmentBase + 0) + startPosA,
              (srcKey + segmentBase + stride) + startPosB,
              (srcVal + segmentBase + stride) + startPosB,
              endPosA - startPosA,
              endPosB - startPosB,
              sortDir);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Naive bubble sort (deliberately simple reference implementation)
////////////////////////////////////////////////////////////////////////////////
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir)
{
    if (N <= 1) {
        return;
    }

    for (uint bottom = 0; bottom < N - 1; bottom++) {
        uint savePos = bottom;
        uint saveKey = key[bottom];

        for (uint i = bottom + 1; i < N; i++)
            if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) {
                savePos = i;
                saveKey = key[i];
            }

        if (savePos != bottom) {
            uint t;
            t = key[savePos];
            key[savePos] = key[bottom];
            key[bottom] = t;
            t = val[savePos];
            val[savePos] = val[bottom];
            val[bottom] = t;
        }
    }
}
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Interface function
|
// Interface function
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
|
extern "C" void
|
||||||
uint *bufVal, uint *srcKey, uint *srcVal, uint N,
|
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir)
|
||||||
uint sortDir) {
|
{
|
||||||
uint *ikey, *ival, *okey, *oval;
|
uint *ikey, *ival, *okey, *oval;
|
||||||
uint stageCount = 0;
|
uint stageCount = 0;
|
||||||
|
|
||||||
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
|
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
|
||||||
;
|
;
|
||||||
|
|
||||||
if (stageCount & 1) {
|
if (stageCount & 1) {
|
||||||
ikey = bufKey;
|
ikey = bufKey;
|
||||||
ival = bufVal;
|
ival = bufVal;
|
||||||
okey = dstKey;
|
okey = dstKey;
|
||||||
oval = dstVal;
|
oval = dstVal;
|
||||||
} else {
|
}
|
||||||
ikey = dstKey;
|
else {
|
||||||
ival = dstVal;
|
ikey = dstKey;
|
||||||
okey = bufKey;
|
ival = dstVal;
|
||||||
oval = bufVal;
|
okey = bufKey;
|
||||||
}
|
oval = bufVal;
|
||||||
|
|
||||||
printf("Bottom-level sort...\n");
|
|
||||||
memcpy(ikey, srcKey, N * sizeof(uint));
|
|
||||||
memcpy(ival, srcVal, N * sizeof(uint));
|
|
||||||
|
|
||||||
for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
|
|
||||||
bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos),
|
|
||||||
sortDir);
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("Merge...\n");
|
|
||||||
uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
|
||||||
uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
|
||||||
uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
|
||||||
uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
|
||||||
memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
|
|
||||||
memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
|
|
||||||
memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
|
|
||||||
memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));
|
|
||||||
|
|
||||||
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
|
|
||||||
uint lastSegmentElements = N % (2 * stride);
|
|
||||||
|
|
||||||
// Find sample ranks and prepare for limiters merge
|
|
||||||
generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);
|
|
||||||
|
|
||||||
// Merge ranks and indices
|
|
||||||
mergeRanksAndIndices(limitsA, ranksA, stride, N);
|
|
||||||
mergeRanksAndIndices(limitsB, ranksB, stride, N);
|
|
||||||
|
|
||||||
// Merge elementary intervals
|
|
||||||
mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride,
|
|
||||||
N, sortDir);
|
|
||||||
|
|
||||||
if (lastSegmentElements <= stride) {
|
|
||||||
// Last merge segment consists of a single array which just needs to be
|
|
||||||
// passed through
|
|
||||||
memcpy(okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
|
|
||||||
lastSegmentElements * sizeof(uint));
|
|
||||||
memcpy(oval + (N - lastSegmentElements), ival + (N - lastSegmentElements),
|
|
||||||
lastSegmentElements * sizeof(uint));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
uint *t;
|
printf("Bottom-level sort...\n");
|
||||||
t = ikey;
|
memcpy(ikey, srcKey, N * sizeof(uint));
|
||||||
ikey = okey;
|
memcpy(ival, srcVal, N * sizeof(uint));
|
||||||
okey = t;
|
|
||||||
t = ival;
|
|
||||||
ival = oval;
|
|
||||||
oval = t;
|
|
||||||
}
|
|
||||||
|
|
||||||
free(limitsB);
|
for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
|
||||||
free(limitsA);
|
bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos), sortDir);
|
||||||
free(ranksB);
|
}
|
||||||
free(ranksA);
|
|
||||||
|
printf("Merge...\n");
|
||||||
|
uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
||||||
|
uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
||||||
|
uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
||||||
|
uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
||||||
|
memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
|
||||||
|
memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
|
||||||
|
memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
|
||||||
|
memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));
|
||||||
|
|
||||||
|
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
|
||||||
|
uint lastSegmentElements = N % (2 * stride);
|
||||||
|
|
||||||
|
// Find sample ranks and prepare for limiters merge
|
||||||
|
generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);
|
||||||
|
|
||||||
|
// Merge ranks and indices
|
||||||
|
mergeRanksAndIndices(limitsA, ranksA, stride, N);
|
||||||
|
mergeRanksAndIndices(limitsB, ranksB, stride, N);
|
||||||
|
|
||||||
|
// Merge elementary intervals
|
||||||
|
mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, N, sortDir);
|
||||||
|
|
||||||
|
if (lastSegmentElements <= stride) {
|
||||||
|
// Last merge segment consists of a single array which just needs to be
|
||||||
|
// passed through
|
||||||
|
memcpy(
|
||||||
|
okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
|
||||||
|
memcpy(
|
||||||
|
oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
|
||||||
|
}
|
||||||
|
|
||||||
|
uint *t;
|
||||||
|
t = ikey;
|
||||||
|
ikey = okey;
|
||||||
|
okey = t;
|
||||||
|
t = ival;
|
||||||
|
ival = oval;
|
||||||
|
oval = t;
|
||||||
|
}
|
||||||
|
|
||||||
|
free(limitsB);
|
||||||
|
free(limitsA);
|
||||||
|
free(ranksB);
|
||||||
|
free(ranksA);
|
||||||
}
|
}
|
||||||
|
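
Note on the merge stage above: the limitsA/limitsB arrays split every pair of stride-long segments at sample boundaries, so each elementary interval handed to merge() draws at most SAMPLE_STRIDE keys from either side, which is exactly what the four asserts verify before each call.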

@@ -29,104 +29,100 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Validate sorted keys array (check for integrity and proper order)
////////////////////////////////////////////////////////////////////////////////
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir)
{
    uint *srcHist;
    uint *resHist;

    if (arrayLength < 2) {
        printf("validateSortedKeys(): arrays too short, exiting...\n");
        return 1;
    }

    printf("...inspecting keys array: ");
    srcHist = (uint *)malloc(numValues * sizeof(uint));
    resHist = (uint *)malloc(numValues * sizeof(uint));

    int flag = 1;

    for (uint j = 0; j < batchSize; j++, srcKey += arrayLength, resKey += arrayLength) {
        // Build histograms for keys arrays
        memset(srcHist, 0, numValues * sizeof(uint));
        memset(resHist, 0, numValues * sizeof(uint));

        for (uint i = 0; i < arrayLength; i++) {
            if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
                srcHist[srcKey[i]]++;
                resHist[resKey[i]]++;
            }
            else {
                fprintf(stderr, "***Set %u source/result key arrays are not limited properly***\n", j);
                flag = 0;
                goto brk;
            }
        }

        // Compare the histograms
        for (uint i = 0; i < numValues; i++)
            if (srcHist[i] != resHist[i]) {
                fprintf(stderr, "***Set %u source/result keys histograms do not match***\n", j);
                flag = 0;
                goto brk;
            }

        // Finally check the ordering
        for (uint i = 0; i < arrayLength - 1; i++)
            if ((sortDir && (resKey[i] > resKey[i + 1])) || (!sortDir && (resKey[i] < resKey[i + 1]))) {
                fprintf(stderr, "***Set %u result key array is not ordered properly***\n", j);
                flag = 0;
                goto brk;
            }
    }

brk:
    free(resHist);
    free(srcHist);

    if (flag)
        printf("OK\n");

    return flag;
}

////////////////////////////////////////////////////////////////////////////////
// Value validation / stability check routines
////////////////////////////////////////////////////////////////////////////////
extern "C" void fillValues(uint *val, uint N)
{
    for (uint i = 0; i < N; i++)
        val[i] = i;
}

extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength)
{
    int correctFlag = 1, stableFlag = 1;

    printf("...inspecting keys and values array: ");

    for (uint i = 0; i < batchSize; i++, resKey += arrayLength, resVal += arrayLength) {
        for (uint j = 0; j < arrayLength; j++) {
            if (resKey[j] != srcKey[resVal[j]])
                correctFlag = 0;

            if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && (resVal[j] > resVal[j + 1]))
                stableFlag = 0;
        }
    }

    printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
    printf(stableFlag ? "...stability property: stable!\n" : "...stability property: NOT stable\n");

    return correctFlag;
}
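
Taken together, the routines above form a self-contained host-side reference path. Below is a minimal, hypothetical driver sketch showing how they fit; the typedef, the testHostSort name, and the N/numValues choices are illustrative assumptions, not part of the sample.

// Hypothetical driver (illustration only): sort one batch on the host and validate it.
#include <stdlib.h>

typedef unsigned int uint; // assumed here; the sample gets this from mergeSort_common.h

extern "C" void mergeSortHost(uint *, uint *, uint *, uint *, uint *, uint *, uint, uint);
extern "C" void fillValues(uint *, uint);
extern "C" uint validateSortedKeys(uint *, uint *, uint, uint, uint, uint);
extern "C" int  validateSortedValues(uint *, uint *, uint *, uint, uint);

int testHostSort(uint N, uint numValues, uint sortDir)
{
    uint *srcKey = (uint *)malloc(N * sizeof(uint));
    uint *srcVal = (uint *)malloc(N * sizeof(uint));
    uint *dstKey = (uint *)malloc(N * sizeof(uint));
    uint *dstVal = (uint *)malloc(N * sizeof(uint));
    uint *bufKey = (uint *)malloc(N * sizeof(uint));
    uint *bufVal = (uint *)malloc(N * sizeof(uint));

    for (uint i = 0; i < N; i++)
        srcKey[i] = rand() % numValues; // keys must stay below numValues for the histogram check
    fillValues(srcVal, N);              // values are original indices, so stability is checkable

    mergeSortHost(dstKey, dstVal, bufKey, bufVal, srcKey, srcVal, N, sortDir);

    // batchSize = 1: a single array of length N
    int ok = validateSortedKeys(dstKey, srcKey, 1, N, numValues, sortDir)
             && validateSortedValues(dstKey, dstVal, srcKey, 1, N);

    free(bufVal);
    free(bufKey);
    free(dstVal);
    free(dstKey);
    free(srcVal);
    free(srcKey);
    return ok;
}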

@@ -11,8 +11,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")

if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")         # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")  # add line information to all builds for debug tools (exclusive to -G option)
endif()

# Include directories and libraries
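
Assuming ENABLE_CUDA_DEBUG is declared as a user-settable option at the top level (only this per-sample fragment is shown), device debug info would be opted into at configure time with cmake -DENABLE_CUDA_DEBUG=ON, while the default path keeps -lineinfo so profilers and debug tools can still map device code back to source lines.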

@@ -30,7 +30,7 @@ cudaStreamCreateWithFlags, cudaFree, cudaDeviceGetAttribute, cudaMallocHost, cud

## Prerequisites

Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

## References (for more details)

@@ -29,106 +29,105 @@
#include <stdio.h>

// Includes CUDA
#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

namespace cg = cooperative_groups;

#if __CUDA_ARCH__ >= 700
template <bool writeSquareRoot>
__device__ void reduceBlockData(cuda::barrier<cuda::thread_scope_block> &barrier,
                                cg::thread_block_tile<32>               &tile32,
                                double                                  &threadSum,
                                double                                  *result)
{
    extern __shared__ double tmp[];

#pragma unroll
    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
        threadSum += tile32.shfl_down(threadSum, offset);
    }
    if (tile32.thread_rank() == 0) {
        tmp[tile32.meta_group_rank()] = threadSum;
    }

    auto token = barrier.arrive();

    barrier.wait(std::move(token));

    // The warp 0 will perform last round of reduction
    if (tile32.meta_group_rank() == 0) {
        double beta = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0;

#pragma unroll
        for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
            beta += tile32.shfl_down(beta, offset);
        }

        if (tile32.thread_rank() == 0) {
            if (writeSquareRoot)
                *result = sqrt(beta);
            else
                *result = beta;
        }
    }
}
#endif

__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size)
{
#if __CUDA_ARCH__ >= 700
#pragma diag_suppress static_var_with_dynamic_init
    cg::thread_block          cta    = cg::this_thread_block();
    cg::grid_group            grid   = cg::this_grid();
    cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

    __shared__ cuda::barrier<cuda::thread_scope_block> barrier;

    if (threadIdx.x == 0) {
        init(&barrier, blockDim.x);
    }

    cg::sync(cta);

    double threadSum = 0.0;
    for (int i = grid.thread_rank(); i < size; i += grid.size()) {
        threadSum += (double)(vecA[i] * vecB[i]);
    }

    // Each thread block performs reduction of partial dotProducts and writes to
    // global mem.
    reduceBlockData<false>(barrier, tile32, threadSum, &partialResults[blockIdx.x]);

    cg::sync(grid);

    // One block performs the final summation of partial dot products
    // of all the thread blocks and writes the sqrt of final dot product.
    if (blockIdx.x == 0) {
        threadSum = 0.0;
        for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) {
            threadSum += partialResults[i];
        }
        reduceBlockData<true>(barrier, tile32, threadSum, &partialResults[0]);
    }

    cg::sync(grid);

    const double finalValue = partialResults[0];

    // Perform normalization of vecA & vecB.
    for (int i = grid.thread_rank(); i < size; i += grid.size()) {
        vecA[i] = (float)vecA[i] / finalValue;
        vecB[i] = (float)vecB[i] / finalValue;
    }
#endif
}
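
The point of the arrive()/wait() pair in reduceBlockData is that cuda::barrier splits a block-wide synchronization into two phases: a thread could, in principle, do independent work between arriving and waiting. This kernel waits immediately, so the barrier behaves like a block-scoped __syncthreads() here, but the same barrier object would also support overlapping independent work with the phase transition.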

@@ -137,119 +136,113 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", argv[0]);

    // This will pick the best possible CUDA capable device
    int dev = findCudaDevice(argc, (const char **)argv);

    int major = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));

    // Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher.
    if (major < 7) {
        printf("simpleAWBarrier requires SM 7.0 or higher. Exiting...\n");
        exit(EXIT_WAIVED);
    }

    int supportsCooperativeLaunch = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev));

    if (!supportsCooperativeLaunch) {
        printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
               "Waiving the run\n",
               dev);
        exit(EXIT_WAIVED);
    }

    int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev);

    printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
{
    float  *vecA, *d_vecA;
    float  *vecB, *d_vecB;
    double *d_partialResults;
    int     size = 10000000;

    checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size));
    checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size));

    checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size));
    checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size));

    float baseVal = 2.0;
    for (int i = 0; i < size; i++) {
        vecA[i] = vecB[i] = baseVal;
    }

    cudaStream_t stream;
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, cudaMemcpyHostToDevice, stream));

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.
    int minGridSize = 0, blockSize = 0;
    checkCudaErrors(
        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));

    int smemSize = ((blockSize / 32) + 1) * sizeof(double);

    int numBlocksPerSm = 0;
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));

    int multiProcessorCount = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));

    minGridSize = multiProcessorCount * numBlocksPerSm;
    checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));

    printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
           "blockSize = %d\n",
           minGridSize,
           blockSize);

    dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);

    void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, (void *)&d_partialResults, (void *)&size};

    checkCudaErrors(cudaLaunchCooperativeKernel(
        (void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream));

    checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    float        expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
    unsigned int matches        = 0;
    for (int i = 0; i < size; i++) {
        if ((vecA[i] - expectedResult) > 0.00001) {
            printf("mismatch at i = %d\n", i);
            break;
        }
        else {
            matches++;
        }
    }

    printf("Result = %s\n", matches == size ? "PASSED" : "FAILED");
    checkCudaErrors(cudaFree(d_vecA));
    checkCudaErrors(cudaFree(d_vecB));
    checkCudaErrors(cudaFree(d_partialResults));

    checkCudaErrors(cudaFreeHost(vecA));
    checkCudaErrors(cudaFreeHost(vecB));
    return matches == size;
}
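
A quick check on the expected value used above: with every element set to baseVal, the dot product vecA . vecB is size * baseVal^2, and the kernel divides each element by its square root, so every normalized element should equal baseVal / sqrt(size * baseVal^2) = 1 / sqrt(size). That is exactly what expectedResult computes, and the host compares against it with a 1e-5 tolerance.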

@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")         # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")  # add line information to all builds for debug tools (exclusive to -G option)
endif()

# Removes -DNDEBUG For Print specific logs in this sample.

@@ -27,6 +27,6 @@ cudaDeviceSynchronize, cudaGetErrorString

## Prerequisites

Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## References (for more details)

@@ -34,17 +34,17 @@
#endif

// Includes, system
#include <cassert>
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

const char *sampleName = "simpleAssert";

@@ -58,9 +58,10 @@ bool testResult = true;
//! Tests assert function.
//! Thread whose id >= N will print assertion failed error message.
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int N)
{
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    assert(gtid < N);
}

////////////////////////////////////////////////////////////////////////////////
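
With the launch configuration used in the next hunk (2 blocks of 32 threads, 64 threads in all) and N = 60, exactly four threads (gtid 60 through 63) fail assert(gtid < N), so four assertion messages are expected between the begin/end markers in the output.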

@@ -70,59 +71,60 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void runTest(int argc, char **argv)
{
    int         Nblocks  = 2;
    int         Nthreads = 32;
    cudaError_t error;

#ifndef _WIN32
    utsname OS_System_Type;
    uname(&OS_System_Type);

    printf("OS_System_Type.release = %s\n", OS_System_Type.release);

    if (!strcasecmp(OS_System_Type.sysname, "Darwin")) {
        printf("simpleAssert is not currently supported on Mac OSX\n\n");
        exit(EXIT_SUCCESS);
    }
    else {
        printf("OS Info: <%s>\n\n", OS_System_Type.version);
    }

#endif

    // This will pick the best possible CUDA capable device
    findCudaDevice(argc, (const char **)argv);

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.
    dim3 dimGrid(Nblocks);
    dim3 dimBlock(Nthreads);

    printf("Launch kernel to generate assertion failures\n");
    testKernel<<<dimGrid, dimBlock>>>(60);

    // Synchronize (flushes assert output).
    printf("\n-- Begin assert output\n\n");
    error = cudaDeviceSynchronize();
    printf("\n-- End assert output\n\n");

    // Check for errors and failed asserts in asynchronous kernel launch.
    if (error == cudaErrorAssert) {
        printf("Device assert failed as expected, "
               "CUDA error message is: %s\n\n",
               cudaGetErrorString(error));
    }

    testResult = error == cudaErrorAssert;
}
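
One caveat worth knowing here: once a device-side assert has fired, the CUDA context is left in a failed state and subsequent CUDA API calls on it keep returning errors, which is why the sample only inspects the error code from cudaDeviceSynchronize and then exits.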

@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")         # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")  # add line information to all builds for debug tools (exclusive to -G option)
endif()

# Include directories and libraries

@@ -30,7 +30,7 @@ cuModuleGetFunction, cuLaunchKernel, cuCtxSynchronize

## Prerequisites

Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

## References (for more details)

@@ -34,15 +34,16 @@
#endif

// Includes, system
#include <cassert>
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>

#include "nvrtc_helper.h"

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

const char *sampleName = "simpleAssert_nvrtc";

@@ -58,56 +59,63 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////

int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void runTest(int argc, char **argv)
{
    int Nblocks  = 2;
    int Nthreads = 32;

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.

    dim3 dimGrid(Nblocks);
    dim3 dimBlock(Nthreads);

    printf("Launch kernel to generate assertion failures\n");
    char  *cubin, *kernel_file;
    size_t cubinSize;

    kernel_file = sdkFindFilePath("simpleAssert_kernel.cu", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

    CUmodule   module = loadCUBIN(cubin, argc, argv);
    CUfunction kernel_addr;

    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel"));

    int   count  = 60;
    void *args[] = {(void *)&count};

    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   dimGrid.x,
                                   dimGrid.y,
                                   dimGrid.z, /* grid dim */
                                   dimBlock.x,
                                   dimBlock.y,
                                   dimBlock.z, /* block dim */
                                   0,
                                   0, /* shared mem, stream */
                                   &args[0], /* arguments */
                                   0));

    // Synchronize (flushes assert output).
    printf("\n-- Begin assert output\n\n");
    CUresult res = cuCtxSynchronize();

    printf("\n-- End assert output\n\n");

    // Check for errors and failed asserts in asynchronous kernel launch.
    if (res == CUDA_ERROR_ASSERT) {
        printf("Device assert failed as expected\n");
    }

    testResult = res == CUDA_ERROR_ASSERT;
}
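
Because this variant compiles simpleAssert_kernel.cu at run time with NVRTC and launches the resulting CUBIN through the driver API, the kernel must be looked up by name with cuModuleGetFunction; that is why the kernel in the next hunk is declared extern "C", keeping its symbol unmangled.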

@@ -32,7 +32,8 @@
//! Thread whose id >= N will print assertion failed error message.
////////////////////////////////////////////////////////////////////////////////

extern "C" __global__ void testKernel(int N)
{
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    assert(gtid < N);
}

@@ -11,8 +11,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")

if(ENABLE_CUDA_DEBUG)
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")         # enable cuda-gdb (may significantly affect performance on some targets)
else()
    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")  # add line information to all builds for debug tools (exclusive to -G option)
endif()

# Include directories and libraries

@@ -27,6 +27,6 @@ cudaStreamCreateWithFlags, cudaFree, cudaMallocHost, cudaFreeHost, cudaStreamSyn

## Prerequisites

Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## References (for more details)

@@ -30,10 +30,10 @@
*/

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN

@@ -45,10 +45,10 @@
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

// Includes, kernels
#include "simpleAtomicIntrinsics_kernel.cuh"

@@ -68,67 +68,67 @@ extern "C" bool computeGold(int *gpuData, const int len);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    cudaStream_t stream;
    // This will pick the best possible CUDA capable device
    findCudaDevice(argc, (const char **)argv);

    StopWatchInterface *timer;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    unsigned int numThreads = 256;
    unsigned int numBlocks  = 64;
    unsigned int numData    = 11;
    unsigned int memSize    = sizeof(int) * numData;

    // allocate mem for the result on host side
    int *hOData;
    checkCudaErrors(cudaMallocHost(&hOData, memSize));

    // initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    // allocate device memory for result
    int *dOData;
    checkCudaErrors(cudaMalloc((void **)&dOData, memSize));
    // copy host memory to device to initialize to zero
    checkCudaErrors(cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream));

    // execute the kernel
    testKernel<<<numBlocks, numThreads, 0, stream>>>(dOData);

    // Copy result from device to host
    checkCudaErrors(cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    checkCudaErrors(cudaFreeHost(hOData));
    checkCudaErrors(cudaFree(dOData));
}
|
@ -42,141 +42,142 @@ extern "C" int computeGold(int *gpuData, const int len);
//! @param idata  input data as provided to device
//! @param len  number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
int computeGold(int *gpuData, const int len)
{
    int val = 0;

    for (int i = 0; i < len; ++i) {
        val += 10;
    }

    if (val != gpuData[0]) {
        printf("atomicAdd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        val -= 10;
    }

    if (val != gpuData[1]) {
        printf("atomicSub failed\n");
        return false;
    }

    bool found = false;

    for (int i = 0; i < len; ++i) {
        // third element should be a member of [0, len)
        if (i == gpuData[2]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for (int i = 0; i < len; ++i) {
        // fourth element should be len-1
        val = max(val, i);
    }

    if (val != gpuData[3]) {
        printf("atomicMax failed\n");
        return false;
    }

    val = 1 << 8;

    for (int i = 0; i < len; ++i) {
        val = min(val, i);
    }

    if (val != gpuData[4]) {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val = 0;

    for (int i = 0; i < len; ++i) {
        val = (val >= limit) ? 0 : val + 1;
    }

    if (val != gpuData[5]) {
        printf("atomicInc failed\n");
        return false;
    }

    limit = 137;
    val = 0;

    for (int i = 0; i < len; ++i) {
        val = ((val == 0) || (val > limit)) ? limit : val - 1;
    }

    if (val != gpuData[6]) {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;

    for (int i = 0; i < len; ++i) {
        // eighth element should be a member of [0, len)
        if (i == gpuData[7]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 9th element should be 1
        val &= (2 * i + 7);
    }

    if (val != gpuData[8]) {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        // 10th element should be 0xff
        val |= (1 << i);
    }

    if (val != gpuData[9]) {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 11th element should be 0xff
        val ^= i;
    }

    if (val != gpuData[10]) {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}

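The atomicInc/atomicDec checks above encode the wrap-around contract CUDA defines for those two intrinsics. As a minimal sketch of that contract (a toy kernel, not part of the sample):

// atomicInc(p, limit) atomically computes ((*p >= limit) ? 0 : *p + 1),
// so a shared counter cycles through [0, limit] instead of growing monotonically.
__global__ void wrapCounter(unsigned int *counter)
{
    atomicInc(counter, 17); // result never exceeds 17; wraps back to 0
}
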
@ -35,48 +35,49 @@
//! @param g_idata  input data in global memory
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int *g_odata)
{
    // access thread id
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Test various atomic instructions

    // Arithmetic atomic instructions

    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    // Atomic subtraction (final should be 0)
    atomicSub(&g_odata[1], 10);

    // Atomic exchange
    atomicExch(&g_odata[2], tid);

    // Atomic maximum
    atomicMax(&g_odata[3], tid);

    // Atomic minimum
    atomicMin(&g_odata[4], tid);

    // Atomic increment (modulo 17+1)
    atomicInc((unsigned int *)&g_odata[5], 17);

    // Atomic decrement
    atomicDec((unsigned int *)&g_odata[6], 137);

    // Atomic compare-and-swap
    atomicCAS(&g_odata[7], tid - 1, tid);

    // Bitwise atomic instructions

    // Atomic AND
    atomicAnd(&g_odata[8], 2 * tid + 7);

    // Atomic OR
    atomicOr(&g_odata[9], 1 << tid);

    // Atomic XOR
    atomicXor(&g_odata[10], tid);
}

#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_

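For orientation, a condensed host-side sketch of how this kernel is driven, mirroring the runTest logic shown earlier (checkCudaErrors is the samples' error-checking helper; the stream-based copies are simplified to synchronous ones here):

int hOData[11] = {0};
hOData[8] = hOData[10] = 0xff; // seed the AND/XOR slots so they produce non-zero results

int *dOData;
checkCudaErrors(cudaMalloc((void **)&dOData, sizeof(hOData)));
checkCudaErrors(cudaMemcpy(dOData, hOData, sizeof(hOData), cudaMemcpyHostToDevice));
testKernel<<<64, 256>>>(dOData); // 64 blocks x 256 threads, as in runTest
checkCudaErrors(cudaMemcpy(hOData, dOData, sizeof(hOData), cudaMemcpyDeviceToHost));
bool ok = computeGold(hOData, 64 * 256); // validate against the reference above
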
@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()

# Include directories and libraries
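In practice the new switch decouples device-debug builds from CMAKE_BUILD_TYPE: configuring with -DENABLE_CUDA_DEBUG=ON compiles device code with -G for cuda-gdb, while the default path adds -lineinfo so profiling and debug tools can still map device code back to source lines.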

@ -33,7 +33,7 @@ cudaBlockSize, cudaGridSize

## Prerequisites

-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed.

## References (for more details)

@ -30,10 +30,10 @@

*/

// includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN

@ -46,7 +46,7 @@
#include <nvrtc_helper.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

const char *sampleName = "simpleAtomicIntrinsics_nvrtc";

@ -64,84 +64,90 @@ extern "C" bool computeGold(int *gpuData, const int len);
// Program main
////////////////////////////////////////////////////////////////////////////////

int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");

    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////

void runTest(int argc, char **argv)
{
    int dev = 0;

    char *cubin, *kernel_file;
    size_t cubinSize;

    kernel_file = sdkFindFilePath("simpleAtomicIntrinsics_kernel.cuh", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

    CUmodule module = loadCUBIN(cubin, argc, argv);
    CUfunction kernel_addr;

    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel"));

    StopWatchInterface *timer;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    unsigned int numThreads = 256;
    unsigned int numBlocks  = 64;
    unsigned int numData    = 11;
    unsigned int memSize    = sizeof(int) * numData;

    // allocate mem for the result on host side
    int *hOData = (int *)malloc(memSize);

    // initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    // allocate device memory for result
    CUdeviceptr dOData;
    checkCudaErrors(cuMemAlloc(&dOData, memSize));
    checkCudaErrors(cuMemcpyHtoD(dOData, hOData, memSize));

    // execute the kernel
    dim3 cudaBlockSize(numThreads, 1, 1);
    dim3 cudaGridSize(numBlocks, 1, 1);

    void *arr[] = {(void *)&dOData};
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   cudaGridSize.x,
                                   cudaGridSize.y,
                                   cudaGridSize.z, /* grid dim */
                                   cudaBlockSize.x,
                                   cudaBlockSize.y,
                                   cudaBlockSize.z, /* block dim */
                                   0,
                                   0, /* shared mem, stream */
                                   &arr[0], /* arguments */
                                   0));

    checkCudaErrors(cuCtxSynchronize());

    // Copy result from device to host
    checkCudaErrors(cuMemcpyDtoH(hOData, dOData, memSize));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    free(hOData);
    checkCudaErrors(cuMemFree(dOData));
}

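One detail worth calling out in the driver-API launch above: cuLaunchKernel receives kernel arguments as an array of pointers, one slot per kernel parameter in declaration order. A hypothetical extension with a second scalar parameter would marshal it the same way:

int n = 42; // hypothetical extra int parameter on the kernel
void *args[] = {(void *)&dOData, (void *)&n}; // one address per kernel parameter, in order
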
@ -43,139 +43,140 @@ extern "C" int computeGold(int *gpuData, const int len);
//! @param len  number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////

int computeGold(int *gpuData, const int len)
{
    int val = 0;

    for (int i = 0; i < len; ++i) {
        val += 10;
    }

    if (val != gpuData[0]) {
        printf("atomicAdd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        val -= 10;
    }

    if (val != gpuData[1]) {
        printf("atomicSub failed\n");
        return false;
    }

    bool found = false;

    for (int i = 0; i < len; ++i) {
        // third element should be a member of [0, len)
        if (i == gpuData[2]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for (int i = 0; i < len; ++i) {
        // fourth element should be len-1
        val = max(val, i);
    }

    if (val != gpuData[3]) {
        printf("atomicMax failed\n");
        return false;
    }

    val = 1 << 8;

    for (int i = 0; i < len; ++i) {
        val = min(val, i);
    }

    if (val != gpuData[4]) {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val = 0;

    for (int i = 0; i < len; ++i) {
        val = (val >= limit) ? 0 : val + 1;
    }

    if (val != gpuData[5]) {
        printf("atomicInc failed\n");
        return false;
    }

    limit = 137;
    val = 0;

    for (int i = 0; i < len; ++i) {
        val = ((val == 0) || (val > limit)) ? limit : val - 1;
    }

    if (val != gpuData[6]) {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;

    for (int i = 0; i < len; ++i) {
        // eighth element should be a member of [0, len)
        if (i == gpuData[7]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 9th element should be 1
        val &= (2 * i + 7);
    }

    if (val != gpuData[8]) {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        // 10th element should be 0xff
        val |= (1 << i);
    }

    if (val != gpuData[9]) {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 11th element should be 0xff
        val ^= i;
    }

    if (val != gpuData[10]) {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}

@ -36,45 +36,46 @@
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////

extern "C" __global__ void testKernel(int *g_odata)
{
    // access thread id
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Test various atomic instructions
    // Arithmetic atomic instructions
    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    // Atomic subtraction (final should be 0)
    atomicSub(&g_odata[1], 10);

    // Atomic exchange
    atomicExch(&g_odata[2], tid);

    // Atomic maximum
    atomicMax(&g_odata[3], tid);

    // Atomic minimum
    atomicMin(&g_odata[4], tid);

    // Atomic increment (modulo 17+1)
    atomicInc((unsigned int *)&g_odata[5], 17);

    // Atomic decrement
    atomicDec((unsigned int *)&g_odata[6], 137);

    // Atomic compare-and-swap
    atomicCAS(&g_odata[7], tid - 1, tid);

    // Bitwise atomic instructions
    // Atomic AND
    atomicAnd(&g_odata[8], 2 * tid + 7);

    // Atomic OR
    atomicOr(&g_odata[9], 1 << tid);

    // Atomic XOR
    atomicXor(&g_odata[10], tid);
}

#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_

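The extern "C" on this variant matters for the NVRTC path: it keeps the kernel's symbol unmangled, so the host code can look it up by the plain string "testKernel" via cuModuleGetFunction, as shown in the runTest above.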
@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()

# Include directories and libraries
@ -27,6 +27,6 @@ cudaFree, cudaMallocHost, cudaFreeHost, cudaStreamSynchronize, cudaStreamSetAttr

## Prerequisites

-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## References (for more details)

@ -26,30 +26,31 @@
*/

// includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

// includes CUDA
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper functions for SDK examples

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);

cudaAccessPolicyWindow initAccessPolicyWindow(void)
{
    cudaAccessPolicyWindow accessPolicyWindow = {0};
    accessPolicyWindow.base_ptr  = (void *)0;
    accessPolicyWindow.num_bytes = 0;
    accessPolicyWindow.hitRatio  = 0.f;
    accessPolicyWindow.hitProp   = cudaAccessPropertyNormal;
    accessPolicyWindow.missProp  = cudaAccessPropertyStreaming;
    return accessPolicyWindow;
}

|
//! @param bigDataSize  input bigData size
//! @param hitcount  how many data access are done within block
////////////////////////////////////////////////////////////////////////////////
static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, int bigDataSize, int hitCount)
{
    __shared__ unsigned int hit;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int tID = row * blockDim.y + col;
    uint32_t psRand = tID;

    atomicExch(&hit, 0);
    __syncthreads();
    while (hit < hitCount) {
        psRand ^= psRand << 13;
        psRand ^= psRand >> 17;
        psRand ^= psRand << 5;

        int idx = tID - psRand;
        if (idx < 0) {
            idx = -idx;
        }

        if ((tID % 2) == 0) {
            data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize];
        }
        else {
            trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize];
        }

        atomicAdd(&hit, 1);
    }
}
////////////////////////////////////////////////////////////////////////////////
// Program main

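The three shift-xor lines in the loop are a xorshift32 step, which gives each thread a cheap pseudo-random access pattern. A host-side reference of the same step, should one want to reproduce the sequence when analyzing hit rates (a sketch, not part of the sample):

#include <stdint.h>

static inline uint32_t xorshift32(uint32_t x)
{
    x ^= x << 13; // same shift triple (13, 17, 5) as the kernel
    x ^= x >> 17;
    x ^= x << 5;
    return x;
}
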
@ -98,117 +99,110 @@ int main(int argc, char **argv) { runTest(argc, argv); }
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    bool bTestResult = true;
    cudaAccessPolicyWindow accessPolicyWindow;
    cudaDeviceProp deviceProp;
    cudaStreamAttrValue streamAttrValue;
    cudaStream_t stream;
    cudaStreamAttrID streamAttrID;
    dim3 threads(32, 32);
    int *dataDevicePointer;
    int *dataHostPointer;
    int dataSize;
    int *bigDataDevicePointer;
    int *bigDataHostPointer;
    int bigDataSize;
    StopWatchInterface *timer = 0;

    printf("%s Starting...\n\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);
    // Get device properties
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
    dim3 blocks(deviceProp.maxGridSize[1], 1);

    // Make sure the device supports the L2 optimization
    if (deviceProp.persistingL2CacheMaxSize == 0) {
        printf("Waiving execution as device %d does not support persisting L2 "
               "Caching\n",
               devID);
        exit(EXIT_WAIVED);
    }

    // Create stream to associate with window
    checkCudaErrors(cudaStreamCreate(&stream));

    // Set the amount of L2 cache that will be persisting to the maximum the
    // device can support
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize));

    // Stream attribute to set
    streamAttrID = cudaStreamAttributeAccessPolicyWindow;

    // Default window
    streamAttrValue.accessPolicyWindow = initAccessPolicyWindow();
    accessPolicyWindow = initAccessPolicyWindow();

    // Allocate size of both buffers
    bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int);
    dataSize = (deviceProp.l2CacheSize / 4) / sizeof(int);

    // Allocate data
    checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int)));
    checkCudaErrors(cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));

    for (int i = 0; i < bigDataSize; ++i) {
        if (i < dataSize) {
            dataHostPointer[i] = i;
        }

        bigDataHostPointer[bigDataSize - i - 1] = i;
    }

    checkCudaErrors(cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
    checkCudaErrors(
        cudaMemcpyAsync(dataDevicePointer, dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(
        bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream));

    // Make a window for the buffer of interest
    accessPolicyWindow.base_ptr  = (void *)dataDevicePointer;
    accessPolicyWindow.num_bytes = dataSize * sizeof(int);
    accessPolicyWindow.hitRatio  = 1.f;
    accessPolicyWindow.hitProp   = cudaAccessPropertyPersisting;
    accessPolicyWindow.missProp  = cudaAccessPropertyNormal;
    streamAttrValue.accessPolicyWindow = accessPolicyWindow;

    // Assign window to stream
    checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));

    // Demote any previous persisting lines
    checkCudaErrors(cudaCtxResetPersistingL2Cache());

    checkCudaErrors(cudaStreamSynchronize(stream));
    kernCacheSegmentTest<<<blocks, threads, 0, stream>>>(
        dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF);

    checkCudaErrors(cudaStreamSynchronize(stream));
    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    // Free memory
    checkCudaErrors(cudaFreeHost(dataHostPointer));
    checkCudaErrors(cudaFreeHost(bigDataHostPointer));
    checkCudaErrors(cudaFree(dataDevicePointer));
    checkCudaErrors(cudaFree(bigDataDevicePointer));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

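If the stream were reused afterwards, a typical teardown (an assumption following the usual L2-persistence guidance; not shown in the sample) would detach the window by re-attaching the zeroed default before demoting the lines:

streamAttrValue.accessPolicyWindow = initAccessPolicyWindow(); // num_bytes == 0 disables the window
checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));
checkCudaErrors(cudaCtxResetPersistingL2Cache()); // demote any persisting lines
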
@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()

# Include directories and libraries
|

add_custom_command(TARGET simpleCUDA2GL
    POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/freeglut.dll ${CMAKE_CURRENT_BINARY_DIR}
+    COMMAND ${CMAKE_COMMAND} -E
+        copy ${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/freeglut.dll
+        ${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
)

add_custom_command(TARGET simpleCUDA2GL
    POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/glew64.dll ${CMAKE_CURRENT_BINARY_DIR}
+    COMMAND ${CMAKE_COMMAND} -E
+        copy ${CMAKE_CURRENT_SOURCE_DIR}/../../../bin/win64/$<CONFIGURATION>/glew64.dll
+        ${CMAKE_CURRENT_BINARY_DIR}/$<CONFIGURATION>
)
endif()
endif()

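The reworked copy destination appends the per-configuration subdirectory, so the DLLs land next to the executable that multi-config generators such as Visual Studio place under $<CONFIGURATION>.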
@ -30,8 +30,7 @@ cudaHostAlloc, cudaGraphicsUnmapResources, cudaMalloc, cudaFree, cudaGraphicsRes

## Prerequisites

-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
Make sure the dependencies mentioned in [Dependencies]() section above are installed.

## References (for more details)

(File diff suppressed because it is too large)
@ -35,28 +35,30 @@ __device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); }
__device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); }

// convert floating point rgb color to 8-bit integer
__device__ int rgbToInt(float r, float g, float b)
{
    r = clamp(r, 0.0f, 255.0f);
    g = clamp(g, 0.0f, 255.0f);
    b = clamp(b, 0.0f, 255.0f);
    return (int(b) << 16) | (int(g) << 8) | int(r);
}

__global__ void cudaProcess(unsigned int *g_odata, int imgw)
{
    extern __shared__ uchar4 sdata[];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bw = blockDim.x;
    int bh = blockDim.y;
    int x  = blockIdx.x * bw + tx;
    int y  = blockIdx.y * bh + ty;

    uchar4 c4 = make_uchar4((x & 0x20) ? 100 : 0, 0, (y & 0x20) ? 100 : 0, 0);
    g_odata[y * imgw + x] = rgbToInt(c4.z, c4.y, c4.x);
}

extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw)
{
    cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw);
}

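rgbToInt packs the three clamped channels into one integer (blue in bits 16-23, green in bits 8-15, red in bits 0-7). A hypothetical inverse, handy when inspecting the packed buffer (not part of the sample), would be:

__device__ uchar3 intToRgb(int c)
{
    return make_uchar3(c & 0xff, (c >> 8) & 0xff, (c >> 16) & 0xff); // r, g, b
}
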
@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)

set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G") # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo") # add line information to all builds for debug tools (exclusive to -G option)
endif()

# Include directories and libraries
@ -27,6 +27,6 @@ cudaHostAlloc, cudaStreamDestroy, cudaFree, cudaSetDevice, cudaGetDeviceCount, c

## Prerequisites

-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## References (for more details)

@ -29,115 +29,124 @@

#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// Create thread
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
{
    return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
}

// Wait for thread to finish
void cutEndThread(CUTThread thread)
{
    WaitForSingleObject(thread, INFINITE);
    CloseHandle(thread);
}

// Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num)
{
    WaitForMultipleObjects(num, threads, true, INFINITE);

    for (int i = 0; i < num; i++) {
        CloseHandle(threads[i]);
    }
}

// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount)
{
    CUTBarrier barrier;

    InitializeCriticalSection(&barrier.criticalSection);
    barrier.barrierEvent = CreateEvent(NULL, TRUE, FALSE, TEXT("BarrierEvent"));
    barrier.count = 0;
    barrier.releaseCount = releaseCount;

    return barrier;
}

// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier)
{
    int myBarrierCount;
    EnterCriticalSection(&barrier->criticalSection);
    myBarrierCount = ++barrier->count;
    LeaveCriticalSection(&barrier->criticalSection);

    if (myBarrierCount >= barrier->releaseCount) {
        SetEvent(barrier->barrierEvent);
    }
}

// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier) { WaitForSingleObject(barrier->barrierEvent, INFINITE); }

// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier) {}

#else
// Create thread
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
{
    pthread_t thread;
    pthread_create(&thread, NULL, func, data);
    return thread;
}

// Wait for thread to finish
void cutEndThread(CUTThread thread) { pthread_join(thread, NULL); }

// Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num)
{
    for (int i = 0; i < num; i++) {
        cutEndThread(threads[i]);
    }
}

// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount)
{
    CUTBarrier barrier;

    barrier.count = 0;
    barrier.releaseCount = releaseCount;

    pthread_mutex_init(&barrier.mutex, 0);
    pthread_cond_init(&barrier.conditionVariable, 0);

    return barrier;
}

// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier)
{
    int myBarrierCount;
    pthread_mutex_lock(&barrier->mutex);
    myBarrierCount = ++barrier->count;
    pthread_mutex_unlock(&barrier->mutex);

    if (myBarrierCount >= barrier->releaseCount) {
        pthread_cond_signal(&barrier->conditionVariable);
    }
}

// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier)
{
    pthread_mutex_lock(&barrier->mutex);

    while (barrier->count < barrier->releaseCount) {
        pthread_cond_wait(&barrier->conditionVariable, &barrier->mutex);
    }

    pthread_mutex_unlock(&barrier->mutex);
}

// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier)
{
    pthread_mutex_destroy(&barrier->mutex);
    pthread_cond_destroy(&barrier->conditionVariable);
}

#endif

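Taken together, these helpers implement a one-shot counting barrier. A usage sketch (N_workers is a placeholder for however many threads the caller spawns):

CUTBarrier barrier = cutCreateBarrier(N_workers); // released after N_workers increments
// ... each worker thread calls cutIncrementBarrier(&barrier) when it finishes ...
cutWaitForBarrier(&barrier); // blocks until the release count is reached
cutDestroyBarrier(&barrier);
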
@ -37,15 +37,16 @@
typedef HANDLE CUTThread;
typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *);

struct CUTBarrier
{
    CRITICAL_SECTION criticalSection;
    HANDLE barrierEvent;
    int releaseCount;
    int count;
};

#define CUT_THREADPROC unsigned WINAPI
#define CUT_THREADEND return 0

#else
// POSIX threads.

@ -55,44 +56,46 @@ typedef pthread_t CUTThread;
typedef void *(*CUT_THREADROUTINE)(void *);

#define CUT_THREADPROC void *
#define CUT_THREADEND return 0

struct CUTBarrier
{
    pthread_mutex_t mutex;
    pthread_cond_t conditionVariable;
    int releaseCount;
    int count;
};

#endif

#ifdef __cplusplus
extern "C"
{
#endif

// Create thread.
CUTThread cutStartThread(CUT_THREADROUTINE, void *data);

// Wait for thread to finish.
void cutEndThread(CUTThread thread);

// Wait for multiple threads.
void cutWaitForThreads(const CUTThread *threads, int num);

// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount);

// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier);

// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier);

// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier);

#ifdef __cplusplus
} // extern "C"
#endif

#endif // MULTITHREADING_H

Samples/0_Introduction/simpleCallback/simpleCallback.cu

@@ -43,172 +43,173 @@
 #include <stdio.h>
 
 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>
 
 #include "multithreading.h"
 
 const int N_workloads = 8;
 const int N_elements_per_workload = 100000;
 
 CUTBarrier thread_barrier;
 
-void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status,
-                                void *data);
+void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status, void *data);
 
-struct heterogeneous_workload {
+struct heterogeneous_workload
+{
     int id;
     int cudaDeviceID;
 
     int *h_data;
     int *d_data;
     cudaStream_t stream;
 
     bool success;
 };
 
-__global__ void incKernel(int *data, int N) {
+__global__ void incKernel(int *data, int N)
+{
     int i = blockIdx.x * blockDim.x + threadIdx.x;
 
-    if (i < N) data[i]++;
+    if (i < N)
+        data[i]++;
 }
 
-CUT_THREADPROC launch(void *void_arg) {
+CUT_THREADPROC launch(void *void_arg)
+{
     heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;
 
     // Select GPU for this CPU thread
     checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));
 
     // Allocate Resources
     checkCudaErrors(cudaStreamCreate(&workload->stream));
-    checkCudaErrors(
-        cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
-    checkCudaErrors(cudaHostAlloc(&workload->h_data,
-                                  N_elements_per_workload * sizeof(int),
-                                  cudaHostAllocPortable));
+    checkCudaErrors(cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
+    checkCudaErrors(cudaHostAlloc(&workload->h_data, N_elements_per_workload * sizeof(int), cudaHostAllocPortable));
 
     // CPU thread generates data
     for (int i = 0; i < N_elements_per_workload; ++i) {
         workload->h_data[i] = workload->id + i;
     }
 
     // Schedule work for GPU in CUDA stream without blocking the CPU thread
     // Note: Dedicated streams enable concurrent execution of workloads on the GPU
     dim3 block(512);
     dim3 grid((N_elements_per_workload + block.x - 1) / block.x);
 
-    checkCudaErrors(cudaMemcpyAsync(workload->d_data, workload->h_data,
-                                    N_elements_per_workload * sizeof(int),
-                                    cudaMemcpyHostToDevice, workload->stream));
-    incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data,
-                                                    N_elements_per_workload);
-    checkCudaErrors(cudaMemcpyAsync(workload->h_data, workload->d_data,
-                                    N_elements_per_workload * sizeof(int),
-                                    cudaMemcpyDeviceToHost, workload->stream));
+    checkCudaErrors(cudaMemcpyAsync(workload->d_data,
+                                    workload->h_data,
+                                    N_elements_per_workload * sizeof(int),
+                                    cudaMemcpyHostToDevice,
+                                    workload->stream));
+    incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data, N_elements_per_workload);
+    checkCudaErrors(cudaMemcpyAsync(workload->h_data,
+                                    workload->d_data,
+                                    N_elements_per_workload * sizeof(int),
+                                    cudaMemcpyDeviceToHost,
+                                    workload->stream));
 
     // New in CUDA 5.0: Add a CPU callback which is called once all currently
     // pending operations in the CUDA stream have finished
-    checkCudaErrors(
-        cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));
+    checkCudaErrors(cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));
 
     CUT_THREADEND;
     // CPU thread end of life, GPU continues to process data...
 }
 
-CUT_THREADPROC postprocess(void *void_arg) {
+CUT_THREADPROC postprocess(void *void_arg)
+{
     heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;
     // ... GPU is done with processing, continue on new CPU thread...
 
     // Select GPU for this CPU thread
     checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));
 
     // CPU thread consumes results from GPU
     workload->success = true;
 
     for (int i = 0; i < N_workloads; ++i) {
         workload->success &= workload->h_data[i] == i + workload->id + 1;
     }
 
     // Free Resources
     checkCudaErrors(cudaFree(workload->d_data));
     checkCudaErrors(cudaFreeHost(workload->h_data));
     checkCudaErrors(cudaStreamDestroy(workload->stream));
 
     // Signal the end of the heterogeneous workload to main thread
     cutIncrementBarrier(&thread_barrier);
 
     CUT_THREADEND;
 }
 
-void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status,
-                                void *data) {
+void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data)
+{
     // Check status of GPU after stream operations are done
     checkCudaErrors(status);
 
     // Spawn new CPU worker thread and continue processing on the CPU
     cutStartThread(postprocess, data);
 }
 
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     int N_gpus, max_gpus = 0;
     int gpuInfo[32]; // assume a maximum of 32 GPUs in a system configuration
 
     printf("Starting simpleCallback\n");
 
     checkCudaErrors(cudaGetDeviceCount(&N_gpus));
     printf("Found %d CUDA capable GPUs\n", N_gpus);
 
     if (N_gpus > 32) {
         printf("simpleCallback only supports 32 GPU(s)\n");
     }
 
     for (int devid = 0; devid < N_gpus; devid++) {
         int SMversion;
         cudaDeviceProp deviceProp;
         cudaSetDevice(devid);
         cudaGetDeviceProperties(&deviceProp, devid);
         SMversion = deviceProp.major << 4 + deviceProp.minor;
-        printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name,
-               deviceProp.major, deviceProp.minor);
-        printf(", %s GPU Callback Functions\n",
-               (SMversion >= 0x11) ? "capable" : "NOT capable");
+        printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name, deviceProp.major, deviceProp.minor);
+        printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable");
 
         if (SMversion >= 0x11) {
             gpuInfo[max_gpus++] = devid;
         }
     }
 
     printf("%d GPUs available to run Callback Functions\n", max_gpus);
 
     heterogeneous_workload *workloads;
-    workloads = (heterogeneous_workload *)malloc(N_workloads *
-                                                 sizeof(heterogeneous_workload));
+    workloads = (heterogeneous_workload *)malloc(N_workloads * sizeof(heterogeneous_workload));
     ;
     thread_barrier = cutCreateBarrier(N_workloads);
 
     // Main thread spawns a CPU worker thread for each heterogeneous workload
     printf("Starting %d heterogeneous computing workloads\n", N_workloads);
 
     for (int i = 0; i < N_workloads; ++i) {
         workloads[i].id = i;
         workloads[i].cudaDeviceID = gpuInfo[i % max_gpus]; // i % N_gpus;
 
         cutStartThread(launch, &workloads[i]);
     }
 
     // Sleep until all workloads have finished
     cutWaitForBarrier(&thread_barrier);
     printf("Total of %d workloads finished:\n", N_workloads);
 
     bool success = true;
 
     for (int i = 0; i < N_workloads; ++i) {
         success &= workloads[i].success;
     }
 
     printf("%s\n", success ? "Success" : "Failure");
 
     free(workloads);
 
     exit(success ? EXIT_SUCCESS : EXIT_FAILURE);
 }
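Two side notes on the code above, unrelated to the formatting change. First, the expression "SMversion = deviceProp.major << 4 + deviceProp.minor" parses as "major << (4 + minor)" because + binds tighter than <<; the packing presumably intended is "(deviceProp.major << 4) + deviceProp.minor" (harmless in practice, since any SM 2.0+ device passes the >= 0x11 test either way). Second, cudaStreamAddCallback is deprecated in current CUDA releases in favor of cudaLaunchHostFunc (CUDA 10+). A minimal sketch of the same completion hand-off with the newer API; onStreamDone and workloadId are illustrative names, not from the sample:

#include <cuda_runtime.h>
#include <stdio.h>

// Host function: runs on a CUDA-managed thread once all previously enqueued
// work in the stream has finished. Unlike the cudaStreamAddCallback callback,
// it receives no stream/status arguments, and it must not call CUDA APIs.
static void CUDART_CB onStreamDone(void *userData)
{
    printf("stream finished for workload %d\n", *(int *)userData);
}

int main()
{
    static int workloadId = 0;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    // ... enqueue cudaMemcpyAsync calls and kernels on stream here ...
    cudaLaunchHostFunc(stream, onStreamDone, &workloadId); // replaces cudaStreamAddCallback
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    return 0;
}

In the sample itself the equivalent change would be a host function that calls cutStartThread(postprocess, data), since spawning a thread is not a CUDA API call and so is allowed inside the host function.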
Samples/0_Introduction/simpleCooperativeGroups/CMakeLists.txt

@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")         # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")  # add line information to all builds for debug tools (exclusive to -G option)
 endif()
 
 # Include directories and libraries
Samples/0_Introduction/simpleCooperativeGroups/README.md

@@ -27,6 +27,6 @@ cudaDeviceSynchronize, cudaGetErrorString
 
 ## Prerequisites
 
-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 
 ## References (for more details)
Samples/0_Introduction/simpleCooperativeGroups/simpleCooperativeGroups.cu

@@ -38,8 +38,8 @@
 *
 */
 
-#include <stdio.h>
 #include <cooperative_groups.h>
+#include <stdio.h>
 
 using namespace cooperative_groups;
@@ -49,35 +49,36 @@ using namespace cooperative_groups;
 * calculates the sum of val across the group g. The workspace array, x,
 * must be large enough to contain g.size() integers.
 */
-__device__ int sumReduction(thread_group g, int *x, int val) {
+__device__ int sumReduction(thread_group g, int *x, int val)
+{
     // rank of this thread in the group
     int lane = g.thread_rank();
 
     // for each iteration of this loop, the number of threads active in the
     // reduction, i, is halved, and each active thread (with index [lane])
     // performs a single summation of it's own value with that
     // of a "partner" (with index [lane+i]).
     for (int i = g.size() / 2; i > 0; i /= 2) {
         // store value for this thread in temporary array
         x[lane] = val;
 
         // synchronize all threads in group
         g.sync();
 
         if (lane < i)
             // active threads perform summation of their value with
             // their partner's value
             val += x[lane + i];
 
         // synchronize all threads in group
         g.sync();
     }
 
     // master thread in group returns result, and others return -1.
     if (g.thread_rank() == 0)
         return val;
     else
         return -1;
 }
 
 /**
@@ -85,93 +86,92 @@ __device__ int sumReduction(thread_group g, int *x, int val) {
 *
 * Creates cooperative groups and performs reductions
 */
-__global__ void cgkernel() {
+__global__ void cgkernel()
+{
     // threadBlockGroup includes all threads in the block
     thread_block threadBlockGroup = this_thread_block();
     int threadBlockGroupSize = threadBlockGroup.size();
 
     // workspace array in shared memory required for reduction
     extern __shared__ int workspace[];
 
     int input, output, expectedOutput;
 
     // input to reduction, for each thread, is its' rank in the group
     input = threadBlockGroup.thread_rank();
 
     // expected output from analytical formula (n-1)(n)/2
     // (noting that indexing starts at 0 rather than 1)
     expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2;
 
     // perform reduction
     output = sumReduction(threadBlockGroup, workspace, input);
 
     // master thread in group prints out result
     if (threadBlockGroup.thread_rank() == 0) {
-        printf(
-            " Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
-            (int)threadBlockGroup.size() - 1, output, expectedOutput);
+        printf(" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
+               (int)threadBlockGroup.size() - 1,
+               output,
+               expectedOutput);
 
-        printf(" Now creating %d groups, each of size 16 threads:\n\n",
-               (int)threadBlockGroup.size() / 16);
+        printf(" Now creating %d groups, each of size 16 threads:\n\n", (int)threadBlockGroup.size() / 16);
     }
 
     threadBlockGroup.sync();
 
     // each tiledPartition16 group includes 16 threads
-    thread_block_tile<16> tiledPartition16 =
-        tiled_partition<16>(threadBlockGroup);
+    thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup);
 
     // This offset allows each group to have its own unique area in the workspace
     // array
-    int workspaceOffset =
-        threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();
+    int workspaceOffset = threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();
 
     // input to reduction, for each thread, is its' rank in the group
     input = tiledPartition16.thread_rank();
 
     // expected output from analytical formula (n-1)(n)/2
     // (noting that indexing starts at 0 rather than 1)
     expectedOutput = 15 * 16 / 2;
 
     // Perform reduction
     output = sumReduction(tiledPartition16, workspace + workspaceOffset, input);
 
     // each master thread prints out result
     if (tiledPartition16.thread_rank() == 0)
-        printf(
-            " Sum of all ranks 0..15 in this tiledPartition16 group is %d "
-            "(expected %d)\n",
-            output, expectedOutput);
+        printf(" Sum of all ranks 0..15 in this tiledPartition16 group is %d "
+               "(expected %d)\n",
+               output,
+               expectedOutput);
 
     return;
 }
 
 /**
  * Host main routine
  */
-int main() {
+int main()
+{
     // Error code to check return values for CUDA calls
     cudaError_t err;
 
     // Launch the kernel
 
     int blocksPerGrid = 1;
     int threadsPerBlock = 64;
 
     printf("\nLaunching a single block with %d threads...\n\n", threadsPerBlock);
 
     // we use the optional third argument to specify the size
     // of shared memory required in the kernel
     cgkernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>();
     err = cudaDeviceSynchronize();
 
     if (err != cudaSuccess) {
-        fprintf(stderr, "Failed to launch kernel (error code %s)!\n",
-                cudaGetErrorString(err));
+        fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(err));
         exit(EXIT_FAILURE);
     }
 
     printf("\n...Done.\n\n");
 
     return 0;
 }
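A side note, not part of this commit: on CUDA 11 and newer, the tile-level sum that sumReduction computes by hand can also be done with the cooperative-groups library function cg::reduce from <cooperative_groups/reduce.h>, which needs no shared-memory workspace. A minimal sketch under that assumption (kernel and names are illustrative):

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <stdio.h>

namespace cg = cooperative_groups;

__global__ void tileReduceSketch()
{
    cg::thread_block block = cg::this_thread_block();
    cg::thread_block_tile<16> tile = cg::tiled_partition<16>(block);

    // Sum each thread's rank across its 16-thread tile; cg::reduce returns
    // the result to every thread in the tile (0 + 1 + ... + 15 = 120).
    int sum = cg::reduce(tile, (int)tile.thread_rank(), cg::plus<int>());

    if (tile.thread_rank() == 0)
        printf("tile sum = %d (expected %d)\n", sum, 15 * 16 / 2);
}

Launched, for example, as tileReduceSketch<<<1, 64>>>() followed by cudaDeviceSynchronize(); no dynamic shared memory argument is required, unlike the hand-written reduction above.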
Samples/0_Introduction/simpleCubemapTexture/CMakeLists.txt

@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")         # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")  # add line information to all builds for debug tools (exclusive to -G option)
 endif()
 
 # Include directories and libraries
Samples/0_Introduction/simpleCubemapTexture/README.md

@@ -27,6 +27,6 @@ cudaMemcpy, cudaCreateChannelDesc, cudaFreeArray, cudaFree, cudaPitchedPtr, cuda
 
 ## Prerequisites
 
-Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
+Download and install the [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
 
 ## References (for more details)
Samples/0_Introduction/simpleCubemapTexture/simpleCubemapTexture.cu

@@ -26,27 +26,27 @@
 */
 
 /*
 * This sample demonstrates how to use texture fetches from layered 2D textures
 * in CUDA C
 *
 * This sample first generates a 3D input data array for the layered texture
 * and the expected output. Then it starts CUDA C kernels, one for each layer,
 * which fetch their layer's texture data (using normalized texture coordinates)
 * transform it to the expected output, and write it to a 3D output data array.
 */
 
 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 // includes CUDA
 #include <cuda_runtime.h>
 
 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>
 
 static const char *sSDKname = "simpleCubemapTexture";
@@ -56,213 +56,207 @@ static const char *sSDKname = "simpleCubemapTexture";
 //! Transform a cubemap face of a linear buffe using cubemap texture lookups
 //! @param g_odata  output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void transformKernel(float *g_odata, int width,
-                                cudaTextureObject_t tex) {
+__global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex)
+{
     // calculate this thread's data point
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
 
     // 0.5f offset and division are necessary to access the original data points
     // in the texture (such that bilinear interpolation will not be activated).
     // For details, see also CUDA Programming Guide, Appendix D
 
     float u = ((x + 0.5f) / (float)width) * 2.f - 1.f;
     float v = ((y + 0.5f) / (float)width) * 2.f - 1.f;
 
     float cx, cy, cz;
 
     for (unsigned int face = 0; face < 6; face++) {
         // Layer 0 is positive X face
         if (face == 0) {
             cx = 1;
             cy = -v;
             cz = -u;
         }
         // Layer 1 is negative X face
         else if (face == 1) {
             cx = -1;
             cy = -v;
             cz = u;
         }
         // Layer 2 is positive Y face
         else if (face == 2) {
             cx = u;
             cy = 1;
             cz = v;
         }
         // Layer 3 is negative Y face
         else if (face == 3) {
             cx = u;
             cy = -1;
             cz = -v;
         }
         // Layer 4 is positive Z face
         else if (face == 4) {
             cx = u;
             cy = -v;
             cz = 1;
         }
         // Layer 4 is negative Z face
         else if (face == 5) {
             cx = -u;
             cy = -v;
             cz = -1;
         }
 
         // read from texture, do expected transformation and write to global memory
-        g_odata[face * width * width + y * width + x] =
-            -texCubemap<float>(tex, cx, cy, cz);
+        g_odata[face * width * width + y * width + x] = -texCubemap<float>(tex, cx, cy, cz);
     }
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     // use command-line specified CUDA device, otherwise use device with highest
     // Gflops/s
     int devID = findCudaDevice(argc, (const char **)argv);
 
     bool bResult = true;
 
     // get number of SMs on this GPU
     cudaDeviceProp deviceProps;
 
     checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name,
-           deviceProps.multiProcessorCount);
+    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
     printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);
 
     if (deviceProps.major < 2) {
-        printf(
-            "%s requires SM 2.0 or higher for support of Texture Arrays.  Test "
-            "will exit... \n",
-            sSDKname);
+        printf("%s requires SM 2.0 or higher for support of Texture Arrays.  Test "
+               "will exit... \n",
+               sSDKname);
 
         exit(EXIT_WAIVED);
     }
 
     // generate input data for layered texture
     unsigned int width = 64, num_faces = 6, num_layers = 1;
     unsigned int cubemap_size = width * width * num_faces;
     unsigned int size = cubemap_size * num_layers * sizeof(float);
     float *h_data = (float *)malloc(size);
 
     for (int i = 0; i < (int)(cubemap_size * num_layers); i++) {
         h_data[i] = (float)i;
     }
 
     // this is the expected transformation of the input data (the expected output)
     float *h_data_ref = (float *)malloc(size);
 
     for (unsigned int layer = 0; layer < num_layers; layer++) {
         for (int i = 0; i < (int)(cubemap_size); i++) {
-            h_data_ref[layer * cubemap_size + i] =
-                -h_data[layer * cubemap_size + i] + layer;
+            h_data_ref[layer * cubemap_size + i] = -h_data[layer * cubemap_size + i] + layer;
         }
     }
 
     // allocate device memory for result
     float *d_data = NULL;
     checkCudaErrors(cudaMalloc((void **)&d_data, size));
 
     // allocate array and copy image data
-    cudaChannelFormatDesc channelDesc =
-        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
     cudaArray *cu_3darray;
     // checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc,
     // make_cudaExtent(width, height, num_layers), cudaArrayLayered ));
-    checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc,
-                                      make_cudaExtent(width, width, num_faces),
-                                      cudaArrayCubemap));
+    checkCudaErrors(
+        cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, width, num_faces), cudaArrayCubemap));
     cudaMemcpy3DParms myparms = {0};
     myparms.srcPos = make_cudaPos(0, 0, 0);
     myparms.dstPos = make_cudaPos(0, 0, 0);
-    myparms.srcPtr =
-        make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
+    myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
     myparms.dstArray = cu_3darray;
     myparms.extent = make_cudaExtent(width, width, num_faces);
     myparms.kind = cudaMemcpyHostToDevice;
     checkCudaErrors(cudaMemcpy3D(&myparms));
 
     cudaTextureObject_t tex;
     cudaResourceDesc texRes;
     memset(&texRes, 0, sizeof(cudaResourceDesc));
 
     texRes.resType = cudaResourceTypeArray;
     texRes.res.array.array = cu_3darray;
 
     cudaTextureDesc texDescr;
     memset(&texDescr, 0, sizeof(cudaTextureDesc));
 
     texDescr.normalizedCoords = true;
     texDescr.filterMode = cudaFilterModeLinear;
     texDescr.addressMode[0] = cudaAddressModeWrap;
     texDescr.addressMode[1] = cudaAddressModeWrap;
     texDescr.addressMode[2] = cudaAddressModeWrap;
     texDescr.readMode = cudaReadModeElementType;
 
     checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
 
     dim3 dimBlock(8, 8, 1);
     dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1);
 
-    printf(
-        "Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
-        "block has 8 x 8 threads\n",
-        width, num_layers, dimGrid.x, dimGrid.y);
+    printf("Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
+           "block has 8 x 8 threads\n",
+           width,
+           num_layers,
+           dimGrid.x,
+           dimGrid.y);
 
     transformKernel<<<dimGrid, dimBlock>>>(d_data, width,
                                            tex); // warmup (for better timing)
 
     // check if kernel execution generated an error
     getLastCudaError("warmup Kernel execution failed");
 
     checkCudaErrors(cudaDeviceSynchronize());
 
     StopWatchInterface *timer = NULL;
     sdkCreateTimer(&timer);
     sdkStartTimer(&timer);
 
     // execute the kernel
     transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, tex);
 
     // check if kernel execution generated an error
     getLastCudaError("Kernel execution failed");
 
     checkCudaErrors(cudaDeviceSynchronize());
     sdkStopTimer(&timer);
     printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
-    printf("%.2f Mtexlookups/sec\n",
-           (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
+    printf("%.2f Mtexlookups/sec\n", (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
     sdkDeleteTimer(&timer);
 
     // allocate mem for the result on host side
     float *h_odata = (float *)malloc(size);
     // copy result from device to host
     checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));
 
     // write regression file if necessary
     if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
         // write file for regression test
-        sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f,
-                            false);
-    } else {
+        sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f, false);
+    }
+    else {
         printf("Comparing kernel output to expected data\n");
 
 #define MIN_EPSILON_ERROR 5e-3f
-        bResult =
-            compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
+        bResult = compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
     }
 
     // cleanup memory
     free(h_data);
     free(h_data_ref);
     free(h_odata);
 
     checkCudaErrors(cudaDestroyTextureObject(tex));
     checkCudaErrors(cudaFree(d_data));
     checkCudaErrors(cudaFreeArray(cu_3darray));
 
     exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }
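As an aside (illustrative only, not part of the sample): the if/else chain in transformKernel encodes the standard cubemap face order +X, -X, +Y, -Y, +Z, -Z. The same mapping written as a small helper that a host-side reference implementation could share with the kernel; CubeDir and cubemapDir are hypothetical names introduced here:

// Maps a face index and in-face coordinates u,v in [-1,1] to the direction
// vector that texCubemap expects. Face order: +X, -X, +Y, -Y, +Z, -Z.
struct CubeDir { float x, y, z; };

__host__ __device__ inline CubeDir cubemapDir(unsigned int face, float u, float v)
{
    switch (face) {
    case 0:  return { 1.0f,  -v,   -u  }; // +X
    case 1:  return {-1.0f,  -v,    u  }; // -X
    case 2:  return {  u,   1.0f,   v  }; // +Y
    case 3:  return {  u,  -1.0f,  -v  }; // -Y
    case 4:  return {  u,    -v,  1.0f }; // +Z
    default: return { -u,    -v, -1.0f }; // -Z
    }
}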
Samples/0_Introduction/simpleDrvRuntime/CMakeLists.txt

@@ -10,8 +10,10 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 
 set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72 75 80 86 87 89 90 100 101 120)
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")  # enable cuda-gdb (expensive)
+if(ENABLE_CUDA_DEBUG)
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G")         # enable cuda-gdb (may significantly affect performance on some targets)
+else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")  # add line information to all builds for debug tools (exclusive to -G option)
 endif()
 
 # Include directories and libraries
|
|||||||
set(CUDA_FATBIN_FILE "${CMAKE_CURRENT_BINARY_DIR}/vectorAdd_kernel64.fatbin")
|
set(CUDA_FATBIN_FILE "${CMAKE_CURRENT_BINARY_DIR}/vectorAdd_kernel64.fatbin")
|
||||||
set(CUDA_KERNEL_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/vectorAdd_kernel.cu")
|
set(CUDA_KERNEL_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/vectorAdd_kernel.cu")
|
||||||
|
|
||||||
|
# Construct GENCODE_FLAGS explicitly from CUDA architectures
|
||||||
|
set(GENCODE_FLAGS "")
|
||||||
|
foreach(arch ${CMAKE_CUDA_ARCHITECTURES})
|
||||||
|
list(APPEND GENCODE_FLAGS "-gencode=arch=compute_${arch},code=sm_${arch}")
|
||||||
|
endforeach()
|
||||||
|
|
||||||
add_custom_command(
|
add_custom_command(
|
||||||
OUTPUT ${CUDA_FATBIN_FILE}
|
OUTPUT ${CUDA_FATBIN_FILE}
|
||||||
COMMAND ${CMAKE_CUDA_COMPILER} ${INCLUDES} ${ALL_CCFLAGS} -Wno-deprecated-gpu-targets ${GENCODE_FLAGS} -o ${CUDA_FATBIN_FILE} -fatbin ${CUDA_KERNEL_SOURCE}
|
COMMAND ${CMAKE_CUDA_COMPILER} ${INCLUDES} ${ALL_CCFLAGS} -Wno-deprecated-gpu-targets ${GENCODE_FLAGS} -o ${CUDA_FATBIN_FILE} -fatbin ${CUDA_KERNEL_SOURCE}
|
||||||
Some files were not shown because too many files have changed in this diff.