mirror of https://github.com/NVIDIA/cuda-samples.git
synced 2025-12-16 10:37:48 +08:00

Merge branch 'master' into cuda_a_dev
This commit is contained in: eddc6fd7e1

.clang-format (new file, 49 lines)
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: Consecutive
AlignConsecutiveDeclarations: Consecutive
AlignConsecutiveMacros: Consecutive
AlignEscapedNewlines: Left
AlignOperands: AlignAfterOperator
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
  AfterClass: true
  AfterControlStatement: false
  AfterExternBlock: true
  AfterFunction: true
  AfterStruct: true
  AfterUnion: true
  BeforeCatch: true
  BeforeElse: true
  IndentBraces: false
BreakBeforeBraces: Custom
BreakBeforeConceptDeclarations: true
BreakBeforeBinaryOperators: NonAssignment
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
ColumnLimit: 120
DerivePointerAlignment: false
FixNamespaceComments: true
IncludeCategories:
  - Regex: '^<.*>'
    Priority: 1
  - Regex: '^".*"'
    Priority: 2
SortIncludes: true
IncludeBlocks: Regroup
IndentWidth: 4
MaxEmptyLinesToKeep: 2
PointerAlignment: Right
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
Standard: c++17
TabWidth: 4
UseTab: Never
...
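The settings above are easiest to read against an example. A hand-written before/after sketch of what this configuration does to a small struct (hypothetical code, not from the repo; the Task changes further down show the same rewrite on real sources):

// Before: attached braces, packed constructor-initializer list.
struct Config {
    Config() : device(0), flags(0){};
    int device, flags;
};

// After: BraceWrapping AfterStruct/AfterFunction put braces on their own
// line, and BreakConstructorInitializers: BeforeComma splits the list with
// leading commas.
struct Config
{
    Config()
        : device(0)
        , flags(0) {};
    int device, flags;
};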
.pre-commit-config.yaml (new file, 100 lines)
# Copyright (c) 2024, NVIDIA CORPORATION.

ci:
  autofix_commit_msg: |
    [pre-commit.ci] auto code formatting
  autofix_prs: false
  autoupdate_branch: ''
  autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
  autoupdate_schedule: quarterly
  skip: []
  submodules: false

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: end-of-file-fixer
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
      - id: mixed-line-ending
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
      - id: trailing-whitespace
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.6
    hooks:
      - id: clang-format
        types_or: [file]
        files: |
          (?x)^(
            ^.*\.c$|
            ^.*\.cpp$|
            ^.*\.cu$|
            ^.*\.cuh$|
            ^.*\.cxx$|
            ^.*\.h$|
            ^.*\.hpp$|
            ^.*\.inl$|
            ^.*\.mm$
          )
        exclude: |
          (?x)^(
            Common/.*
          )
        args: ["-fallback-style=none", "-style=file", "-i"]
README.md

@@ -1,6 +1,6 @@
 # CUDA Samples

-Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.8](https://developer.nvidia.com/cuda-downloads).
+Samples for CUDA Developers which demonstrates features in CUDA Toolkit. This version supports [CUDA Toolkit 12.9](https://developer.nvidia.com/cuda-downloads).

 ## Release Notes

UnifiedMemoryStreams.cu

@@ -31,10 +31,10 @@
  */

 // system includes
+#include <algorithm>
 #include <cstdio>
 #include <ctime>
 #include <vector>
-#include <algorithm>

 #ifdef USE_PTHREADS
 #include <pthread.h>
 #else

@@ -58,15 +58,25 @@ double drand48() { return double(rand()) / RAND_MAX; }
 const char *sSDKname = "UnifiedMemoryStreams";

 // simple task
-template <typename T>
-struct Task {
+template <typename T> struct Task
+{
     unsigned int size, id;
     T *data;
     T *result;
     T *vector;

-    Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL){};
-    Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL) {
+    Task()
+        : size(0)
+        , id(0)
+        , data(NULL)
+        , result(NULL)
+        , vector(NULL) {};
+    Task(unsigned int s)
+        : size(s)
+        , id(0)
+        , data(NULL)
+        , result(NULL)
+    {
         // allocate unified memory -- the operation performed in this example will
         // be a DGEMV
         checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));

@@ -75,7 +85,8 @@ struct Task {
         checkCudaErrors(cudaDeviceSynchronize());
     }

-    ~Task() {
+    ~Task()
+    {
         // ensure all memory is deallocated
         checkCudaErrors(cudaDeviceSynchronize());
         checkCudaErrors(cudaFree(data));

@@ -83,7 +94,8 @@ struct Task {
         checkCudaErrors(cudaFree(vector));
     }

-    void allocate(const unsigned int s, const unsigned int unique_id) {
+    void allocate(const unsigned int s, const unsigned int unique_id)
+    {
         // allocate unified memory outside of constructor
         id = unique_id;
         size = s;

@@ -105,7 +117,8 @@ struct Task {
 };

 #ifdef USE_PTHREADS
-struct threadData_t {
+struct threadData_t
+{
     int tid;
     Task<double> *TaskListPtr;
     cudaStream_t *streams;

@@ -117,8 +130,8 @@ typedef struct threadData_t threadData;
 #endif

 // simple host dgemv: assume data is in row-major format and square
-template <typename T>
-void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) {
+template <typename T> void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result)
+{
     // rows
     for (int i = 0; i < n; i++) {
         result[i] *= beta;

@@ -131,7 +144,8 @@ void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) {

 // execute a single task on either host or device depending on size
 #ifdef USE_PTHREADS
-void *execute(void *inpArgs) {
+void *execute(void *inpArgs)
+{
     threadData *dataPtr = (threadData *)inpArgs;
     cudaStream_t *stream = dataPtr->streams;
     cublasHandle_t *handle = dataPtr->handles;

@@ -142,92 +156,75 @@ void *execute(void *inpArgs) {

     if (t.size < 100) {
         // perform on host
-        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
-               t.size);
+        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);

         // attach managed memory to a (dummy) stream to allow host access while
         // the device is running
-        checkCudaErrors(
-            cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
-        checkCudaErrors(
-            cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
-        checkCudaErrors(
-            cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
         // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
         checkCudaErrors(cudaStreamSynchronize(stream[0]));
         // call the host operation
         gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
-    } else {
+    }
+    else {
         // perform on device
-        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
-               t.size);
+        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
         double one = 1.0;
         double zero = 0.0;

         // attach managed memory to my stream
         checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
-                                                 cudaMemAttachSingle));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
-                                                 cudaMemAttachSingle));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
-                                                 cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
         // call the device operation
-        checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
-                                    &one, t.data, t.size, t.vector, 1, &zero,
-                                    t.result, 1));
+        checkCudaErrors(cublasDgemv(
+            handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
     }
 }

 pthread_exit(NULL);
 }
 #else
-template <typename T>
-void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream,
-             int tid) {
+template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
+{
     if (t.size < 100) {
         // perform on host
-        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
-               t.size);
+        printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);

         // attach managed memory to a (dummy) stream to allow host access while the
         // device is running
-        checkCudaErrors(
-            cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
-        checkCudaErrors(
-            cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
-        checkCudaErrors(
-            cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
         // necessary to ensure Async cudaStreamAttachMemAsync calls have finished
         checkCudaErrors(cudaStreamSynchronize(stream[0]));
         // call the host operation
         gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
-    } else {
+    }
+    else {
         // perform on device
-        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
-               t.size);
+        printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
         double one = 1.0;
         double zero = 0.0;

         // attach managed memory to my stream
         checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
-                                                 cudaMemAttachSingle));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
-                                                 cudaMemAttachSingle));
-        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
-                                                 cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
+        checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
         // call the device operation
-        checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
-                                    &one, t.data, t.size, t.vector, 1, &zero,
-                                    t.result, 1));
+        checkCudaErrors(cublasDgemv(
+            handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
     }
 }
 #endif

 // populate a list of tasks with random sizes
-template <typename T>
-void initialise_tasks(std::vector<Task<T> > &TaskList) {
+template <typename T> void initialise_tasks(std::vector<Task<T>> &TaskList)
+{
     for (unsigned int i = 0; i < TaskList.size(); i++) {
         // generate random size
         int size;

@@ -236,7 +233,8 @@ void initialise_tasks(std::vector<Task<T> > &TaskList) {
     }
 }

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     // set device
     cudaDeviceProp device_prop;
     int dev_id = findCudaDevice(argc, (const char **)argv);

@@ -294,19 +292,17 @@ int main(int argc, char **argv) {

         if ((TaskList.size() / nthreads) == 0) {
             InputToThreads[i].taskSize = (TaskList.size() / nthreads);
-            InputToThreads[i].TaskListPtr =
-                &TaskList[i * (TaskList.size() / nthreads)];
-        } else {
+            InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
+        }
+        else {
             if (i == nthreads - 1) {
-                InputToThreads[i].taskSize =
-                    (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
+                InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
                 InputToThreads[i].TaskListPtr =
-                    &TaskList[i * (TaskList.size() / nthreads) +
-                              (TaskList.size() % nthreads)];
-            } else {
+                    &TaskList[i * (TaskList.size() / nthreads) + (TaskList.size() % nthreads)];
+            }
+            else {
                 InputToThreads[i].taskSize = (TaskList.size() / nthreads);
-                InputToThreads[i].TaskListPtr =
-                    &TaskList[i * (TaskList.size() / nthreads)];
+                InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
             }
         }
     }
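The hunks above keep reflowing one idiom: allocate managed memory, attach it to a stream, synchronize, then touch it from the host. A minimal self-contained sketch of that idiom (illustrative names and sizes, not the sample's code; error checks elided):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    float *data = NULL;
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // One allocation visible to both host and device.
    cudaMallocManaged(&data, 1024 * sizeof(float));

    // cudaMemAttachHost: the host may touch this memory even while unrelated
    // device work is in flight. The attach call itself is asynchronous, so
    // synchronize the stream before the first host access.
    cudaStreamAttachMemAsync(stream, data, 0, cudaMemAttachHost);
    cudaStreamSynchronize(stream);
    data[0] = 42.0f; // safe host access
    printf("%f\n", data[0]);

    // cudaMemAttachSingle: hand the allocation to this stream for device work.
    cudaStreamAttachMemAsync(stream, data, 0, cudaMemAttachSingle);

    cudaFree(data);
    cudaStreamDestroy(stream);
    return 0;
}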
asyncAPI.cu

@@ -38,19 +38,21 @@
 #include <stdio.h>

 // includes CUDA Runtime
-#include <cuda_runtime.h>
 #include <cuda_profiler_api.h>
+#include <cuda_runtime.h>

 // includes, project
 #include <helper_cuda.h>
 #include <helper_functions.h> // helper utility functions

-__global__ void increment_kernel(int *g_data, int inc_value) {
+__global__ void increment_kernel(int *g_data, int inc_value)
+{
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     g_data[idx] = g_data[idx] + inc_value;
 }

-bool correct_output(int *data, const int n, const int x) {
+bool correct_output(int *data, const int n, const int x)
+{
     for (int i = 0; i < n; i++)
         if (data[i] != x) {
             printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);

@@ -60,7 +62,8 @@ bool correct_output(int *data, const int n, const int x) {
     return true;
 }

-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     int devID;
     cudaDeviceProp deviceProps;

@@ -126,8 +129,7 @@ int main(int argc, char *argv[]) {
     // print the cpu and gpu times
     printf("time spent executing by the GPU: %.2f\n", gpu_time);
     printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
-    printf("CPU executed %lu iterations while waiting for GPU to finish\n",
-           counter);
+    printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);

     // check the output for correctness
     bool bFinalResults = correct_output(a, n, value);
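These hunks reformat the sample's event-based timing. The underlying pattern, as a compact standalone sketch (illustrative sizes; error checking elided):

#include <cstdio>
#include <cstring>
#include <cuda_runtime.h>

__global__ void inc(int *g, int v) { g[blockIdx.x * blockDim.x + threadIdx.x] += v; }

int main()
{
    const int n = 1 << 20, nbytes = n * sizeof(int);
    int *h = NULL, *d = NULL;
    cudaMallocHost((void **)&h, nbytes); // pinned memory: required for truly async copies
    memset(h, 0, nbytes);
    cudaMalloc((void **)&d, nbytes);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Queue everything asynchronously in stream 0, bracketed by events.
    cudaEventRecord(start, 0);
    cudaMemcpyAsync(d, h, nbytes, cudaMemcpyHostToDevice, 0);
    inc<<<n / 256, 256>>>(d, 7);
    cudaMemcpyAsync(h, d, nbytes, cudaMemcpyDeviceToHost, 0);
    cudaEventRecord(stop, 0);

    // The CPU stays busy while the GPU pipeline drains.
    unsigned long counter = 0;
    while (cudaEventQuery(stop) == cudaErrorNotReady)
        counter++;

    float gpu_time = 0.0f;
    cudaEventElapsedTime(&gpu_time, start, stop);
    printf("GPU time: %.2f ms, CPU spun %lu times\n", gpu_time, counter);

    cudaFreeHost(h);
    cudaFree(d);
    return 0;
}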
clock.cu

@@ -48,15 +48,16 @@
 // This kernel computes a standard parallel reduction and evaluates the
 // time it takes to do that for each block. The timing results are stored
 // in device memory.
-__global__ static void timedReduction(const float *input, float *output,
-                                      clock_t *timer) {
+__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
+{
     // __shared__ float shared[2 * blockDim.x];
     extern __shared__ float shared[];

     const int tid = threadIdx.x;
     const int bid = blockIdx.x;

-    if (tid == 0) timer[bid] = clock();
+    if (tid == 0)
+        timer[bid] = clock();

     // Copy input.
     shared[tid] = input[tid];

@@ -77,11 +78,13 @@ __global__ static void timedReduction(const float *input, float *output,
     }

     // Write result.
-    if (tid == 0) output[bid] = shared[0];
+    if (tid == 0)
+        output[bid] = shared[0];

     __syncthreads();

-    if (tid == 0) timer[bid + gridDim.x] = clock();
+    if (tid == 0)
+        timer[bid + gridDim.x] = clock();
 }

 #define NUM_BLOCKS 64

@@ -104,7 +107,8 @@ __global__ static void timedReduction(const float *input, float *output,
 // the memory. With more than 32 the speed scales linearly.

 // Start the main CUDA Sample here
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("CUDA Clock sample\n");

     // This will pick the best possible CUDA capable device

@@ -121,20 +125,15 @@ int main(int argc, char **argv) {
         input[i] = (float)i;
     }

-    checkCudaErrors(
-        cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
+    checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
     checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
-    checkCudaErrors(
-        cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+    checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));

-    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2,
-                               cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));

-    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(
-        dinput, doutput, dtimer);
+    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);

-    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2,
-                               cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));

     checkCudaErrors(cudaFree(dinput));
     checkCudaErrors(cudaFree(doutput));
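For reference, the timer array the kernel fills holds per-block start clocks in [0, NUM_BLOCKS) and stop clocks in [NUM_BLOCKS, 2*NUM_BLOCKS). A plausible host-side use of it, continuing the variables above and assuming that layout:

// Sketch: average on-chip cycles per block, from the copied-back timer array.
double avgClocks = 0.0;
for (int i = 0; i < NUM_BLOCKS; i++) {
    avgClocks += (double)(timer[NUM_BLOCKS + i] - timer[i]); // stop - start
}
avgClocks /= NUM_BLOCKS;
printf("Average clocks/block = %.0f\n", avgClocks);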
clock_nvrtc (host code)

@@ -34,12 +34,11 @@
  */

 // System includes
-#include <stdio.h>
-#include <stdint.h>
 #include <assert.h>

 #include <cuda_runtime.h>
 #include <nvrtc_helper.h>
+#include <stdint.h>
+#include <stdio.h>

 // helper functions and utilities to work with CUDA
 #include <helper_functions.h>

@@ -71,7 +70,8 @@

 // Start the main CUDA Sample here

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("CUDA Clock sample\n");

     typedef long clock_t;

@@ -106,17 +106,20 @@ int main(int argc, char **argv) {

     void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};

-    checkCudaErrors(cuLaunchKernel(
-        kernel_addr, cudaGridSize.x, cudaGridSize.y,
-        cudaGridSize.z, /* grid dim */
-        cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
-        sizeof(float) * 2 * NUM_THREADS, 0, /* shared mem, stream */
-        &arr[0], /* arguments */
-        0));
+    checkCudaErrors(cuLaunchKernel(kernel_addr,
+                                   cudaGridSize.x,
+                                   cudaGridSize.y,
+                                   cudaGridSize.z, /* grid dim */
+                                   cudaBlockSize.x,
+                                   cudaBlockSize.y,
+                                   cudaBlockSize.z, /* block dim */
+                                   sizeof(float) * 2 * NUM_THREADS,
+                                   0, /* shared mem, stream */
+                                   &arr[0], /* arguments */
+                                   0));

     checkCudaErrors(cuCtxSynchronize());
-    checkCudaErrors(
-        cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
+    checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
     checkCudaErrors(cuMemFree(dinput));
     checkCudaErrors(cuMemFree(doutput));
     checkCudaErrors(cuMemFree(dtimer));
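The _nvrtc variant compiles its kernel at run time instead of build time. A minimal sketch of that compilation step (an assumption about roughly what nvrtc_helper.h wraps; the kernel source and names here are illustrative):

#include <cstdio>
#include <nvrtc.h>
#include <vector>

int main()
{
    const char *src = "extern \"C\" __global__ void k(int *p) { p[threadIdx.x] = threadIdx.x; }";
    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, src, "k.cu", 0, NULL, NULL); // source, name, no headers
    nvrtcCompileProgram(prog, 0, NULL);                    // no extra options
    size_t ptxSize;
    nvrtcGetPTXSize(prog, &ptxSize);
    std::vector<char> ptx(ptxSize);
    nvrtcGetPTX(prog, ptx.data());
    nvrtcDestroyProgram(&prog);
    // The PTX is then loaded with the driver API (cuModuleLoadDataEx,
    // cuModuleGetFunction) and launched via cuLaunchKernel, as above.
    printf("compiled %zu bytes of PTX\n", ptxSize);
    return 0;
}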
clock_nvrtc (kernel source)

@@ -37,15 +37,16 @@
 // time it takes to do that for each block. The timing results are stored
 // in device memory.

-extern "C" __global__ void timedReduction(const float *input, float *output,
-                                          clock_t *timer) {
+extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer)
+{
     // __shared__ float shared[2 * blockDim.x];
     extern __shared__ float shared[];

     const int tid = threadIdx.x;
     const int bid = blockIdx.x;

-    if (tid == 0) timer[bid] = clock();
+    if (tid == 0)
+        timer[bid] = clock();

     // Copy input.
     shared[tid] = input[tid];

@@ -66,9 +67,11 @@ extern "C" __global__ void timedReduction(const float *input, float *output,
     }

     // Write result.
-    if (tid == 0) output[bid] = shared[0];
+    if (tid == 0)
+        output[bid] = shared[0];

     __syncthreads();

-    if (tid == 0) timer[bid + gridDim.x] = clock();
+    if (tid == 0)
+        timer[bid + gridDim.x] = clock();
 }
cudaOpenMP.cu

@@ -37,20 +37,24 @@
 using namespace std;

 // a simple kernel that simply increments each array element by b
-__global__ void kernelAddConstant(int *g_a, const int b) {
+__global__ void kernelAddConstant(int *g_a, const int b)
+{
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
     g_a[idx] += b;
 }

 // a predicate that checks whether each array element is set to its index plus b
-int correctResult(int *data, const int n, const int b) {
+int correctResult(int *data, const int n, const int b)
+{
     for (int i = 0; i < n; i++)
-        if (data[i] != i + b) return 0;
+        if (data[i] != i + b)
+            return 0;

     return 1;
 }

-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     int num_gpus = 0; // number of CUDA GPUs

     printf("%s Starting...\n\n", argv[0]);

@@ -93,7 +97,8 @@ int main(int argc, char *argv[]) {
         return 1;
     }

-    for (unsigned int i = 0; i < n; i++) a[i] = i;
+    for (unsigned int i = 0; i < n; i++)
+        a[i] = i;

     ////////////////////////////////////////////////////////////////
     // run as many CPU threads as there are CUDA devices

@@ -105,8 +110,7 @@ int main(int argc, char *argv[]) {
     // Recall that all variables declared inside an "omp parallel" scope are
     // local to each CPU thread
     //
-    omp_set_num_threads(
-        num_gpus); // create as many CPU threads as there are CUDA devices
+    omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
     // omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there
     // are CUDA devices
 #pragma omp parallel

@@ -116,31 +120,23 @@ int main(int argc, char *argv[]) {

         // set and check the CUDA device for this CPU thread
         int gpu_id = -1;
-        checkCudaErrors(cudaSetDevice(
-            cpu_thread_id %
-            num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
+        checkCudaErrors(
+            cudaSetDevice(cpu_thread_id % num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
         checkCudaErrors(cudaGetDevice(&gpu_id));
-        printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id,
-               num_cpu_threads, gpu_id);
+        printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);

-        int *d_a =
-            0; // pointer to memory on the device associated with this CPU thread
-        int *sub_a =
-            a +
-            cpu_thread_id * n /
-                num_cpu_threads; // pointer to this CPU thread's portion of data
+        int *d_a = 0; // pointer to memory on the device associated with this CPU thread
+        int *sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data
         unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
         dim3 gpu_threads(128); // 128 threads per block
         dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));

         checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
         checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
-        checkCudaErrors(
-            cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
+        checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
         kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);

-        checkCudaErrors(
-            cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
+        checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
         checkCudaErrors(cudaFree(d_a));
     }
     printf("---------------------------\n");

@@ -153,7 +149,8 @@ int main(int argc, char *argv[]) {
     //
     bool bResult = correctResult(a, n, b);

-    if (a) free(a); // free CPU memory
+    if (a)
+        free(a); // free CPU memory

     exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }
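The pattern being reformatted here is one OpenMP CPU thread per CUDA device. Stripped to its core, as a sketch (compile with nvcc -Xcompiler -fopenmp; per-device work elided):

#include <cstdio>
#include <cuda_runtime.h>
#include <omp.h>

int main()
{
    int num_gpus = 0;
    cudaGetDeviceCount(&num_gpus);
    if (num_gpus < 1)
        return 1;
    omp_set_num_threads(num_gpus); // one CPU thread per device
#pragma omp parallel
    {
        int tid = omp_get_thread_num();
        cudaSetDevice(tid % num_gpus); // each thread binds its own GPU
        int dev = -1;
        cudaGetDevice(&dev);
        printf("CPU thread %d drives CUDA device %d\n", tid, dev);
        // ... per-device cudaMalloc / kernel launch / cudaMemcpy here ...
    }
    return 0;
}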
fp16ScalarProduct.cu

@@ -25,17 +25,18 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

-#include "cuda_fp16.h"
-#include "helper_cuda.h"
-
 #include <cstdio>
 #include <cstdlib>
 #include <ctime>

+#include "cuda_fp16.h"
+#include "helper_cuda.h"
+
 #define NUM_OF_BLOCKS 128
 #define NUM_OF_THREADS 128

-__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) {
+__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v)
+{
     if (threadIdx.x < 64)
         v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
     __syncthreads();

@@ -59,27 +60,34 @@ __forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) {
     __syncthreads();
 }

-__forceinline__ __device__ void reduceInShared_native(half2 *const v) {
-    if (threadIdx.x < 64) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
+__forceinline__ __device__ void reduceInShared_native(half2 *const v)
+{
+    if (threadIdx.x < 64)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
     __syncthreads();
-    if (threadIdx.x < 32) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
+    if (threadIdx.x < 32)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
     __syncthreads();
-    if (threadIdx.x < 16) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
+    if (threadIdx.x < 16)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
     __syncthreads();
-    if (threadIdx.x < 8) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
+    if (threadIdx.x < 8)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
     __syncthreads();
-    if (threadIdx.x < 4) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
+    if (threadIdx.x < 4)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
     __syncthreads();
-    if (threadIdx.x < 2) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
+    if (threadIdx.x < 2)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
     __syncthreads();
-    if (threadIdx.x < 1) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
+    if (threadIdx.x < 1)
+        v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
     __syncthreads();
 }

-__global__ void scalarProductKernel_intrinsics(half2 const *const a,
-                                               half2 const *const b,
-                                               float *const results,
-                                               size_t const size) {
+__global__ void
+scalarProductKernel_intrinsics(half2 const *const a, half2 const *const b, float *const results, size_t const size)
+{
     const int stride = gridDim.x * blockDim.x;
     __shared__ half2 shArray[NUM_OF_THREADS];

@@ -101,10 +109,9 @@ __global__ void scalarProductKernel_intrinsics(half2 const *const a,
     }
 }

-__global__ void scalarProductKernel_native(half2 const *const a,
-                                           half2 const *const b,
-                                           float *const results,
-                                           size_t const size) {
+__global__ void
+scalarProductKernel_native(half2 const *const a, half2 const *const b, float *const results, size_t const size)
+{
     const int stride = gridDim.x * blockDim.x;
     __shared__ half2 shArray[NUM_OF_THREADS];

@@ -126,7 +133,8 @@ __global__ void scalarProductKernel_native(half2 const *const a,
     }
 }

-void generateInput(half2 *a, size_t size) {
+void generateInput(half2 *a, size_t size)
+{
     for (size_t i = 0; i < size; ++i) {
         half2 temp;
         temp.x = static_cast<float>(rand() % 4);

@@ -135,7 +143,8 @@ void generateInput(half2 *a, size_t size) {
     }
 }

-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     srand((unsigned int)time(NULL));
     size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;

@@ -151,8 +160,7 @@ int main(int argc, char *argv[]) {
     checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));

     if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
-        printf(
-            "ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
-            "higher.\n");
+        printf("ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
+               "higher.\n");
         return EXIT_WAIVED;
     }

@@ -162,23 +170,17 @@ int main(int argc, char *argv[]) {
         checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
     }

-    checkCudaErrors(
-        cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
-    checkCudaErrors(
-        cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));
+    checkCudaErrors(cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
+    checkCudaErrors(cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));

     for (int i = 0; i < 2; ++i) {
         generateInput(vec[i], size);
-        checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i],
-                                   cudaMemcpyHostToDevice));
+        checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i], cudaMemcpyHostToDevice));
     }

-    scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
-        devVec[0], devVec[1], devResults, size);
+    scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);

-    checkCudaErrors(cudaMemcpy(results, devResults,
-                               NUM_OF_BLOCKS * sizeof *results,
-                               cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));

     float result_native = 0;
     for (int i = 0; i < NUM_OF_BLOCKS; ++i) {

@@ -186,12 +188,9 @@ int main(int argc, char *argv[]) {
     }
     printf("Result native operators\t: %f \n", result_native);

-    scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
-        devVec[0], devVec[1], devResults, size);
+    scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);

-    checkCudaErrors(cudaMemcpy(results, devResults,
-                               NUM_OF_BLOCKS * sizeof *results,
-                               cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));

     float result_intrinsics = 0;
     for (int i = 0; i < NUM_OF_BLOCKS; ++i) {

@@ -199,9 +198,7 @@ int main(int argc, char *argv[]) {
     }
     printf("Result intrinsics\t: %f \n", result_intrinsics);

-    printf("&&&& fp16ScalarProduct %s\n",
-           (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED"
-                                                               : "FAILED");
+    printf("&&&& fp16ScalarProduct %s\n", (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED" : "FAILED");

     for (int i = 0; i < 2; ++i) {
         checkCudaErrors(cudaFree(devVec[i]));
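The two kernels in this file differ only in whether they use half2 operators or explicit intrinsics. A hypothetical helper showing the lane-wise fp16 arithmetic involved (not from the sample; requires sm_53 or newer, matching the compute-capability check above):

#include <cuda_fp16.h>

// Lane-wise fp16 dot contribution of one half2 pair, accumulated in fp32.
__device__ float dot2(half2 x, half2 y)
{
    half2 p = __hmul2(x, y);                 // (x.lo*y.lo, x.hi*y.hi) in one instruction
    return __low2float(p) + __high2float(p); // reduce the two lanes in fp32
}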
matrixMul.cu

@@ -40,24 +40,23 @@
 */

 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>

 // CUDA runtime
-#include <cuda_runtime.h>
 #include <cuda_profiler_api.h>
+#include <cuda_runtime.h>

 // Helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 /**
 * Matrix multiplication (CUDA Kernel) on the device: C = A * B
 * wA is A's width and wB is B's width
 */
-template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,
-                                                        float *B, int wA,
-                                                        int wB) {
+template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
+{
     // Block index
     int bx = blockIdx.x;
     int by = blockIdx.y;

@@ -87,9 +86,7 @@ template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,

     // Loop over all the sub-matrices of A and B
     // required to compute the block sub-matrix
-    for (int a = aBegin, b = bBegin;
-         a <= aEnd;
-         a += aStep, b += bStep) {
+    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
         // Declaration of the shared memory array As used to
         // store the sub-matrix of A
         __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

@@ -128,7 +125,8 @@ template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,
     C[c + wB * ty + tx] = Csub;
 }

-void ConstantInit(float *data, int size, float val) {
+void ConstantInit(float *data, int size, float val)
+{
     for (int i = 0; i < size; ++i) {
         data[i] = val;
     }

@@ -137,9 +135,8 @@ void ConstantInit(float *data, int size, float val) {
 /**
 * Run a simple test of matrix multiplication using CUDA
 */
-int MatrixMultiply(int argc, char **argv,
-                   int block_size, const dim3 &dimsA,
-                   const dim3 &dimsB) {
+int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB)
+{
     // Allocate host memory for matrices A and B
     unsigned int size_A = dimsA.x * dimsA.y;
     unsigned int mem_size_A = sizeof(float) * size_A;

@@ -181,10 +178,8 @@ int MatrixMultiply(int argc, char **argv,
     checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

     // copy host memory to device
-    checkCudaErrors(
-        cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(
-        cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));

     // Setup execution parameters
     dim3 threads(block_size, block_size);

@@ -195,11 +190,10 @@ int MatrixMultiply(int argc, char **argv,

     // Performs warmup operation using matrixMul CUDA kernel
     if (block_size == 16) {
-        MatrixMulCUDA<16>
-            <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
-    } else {
-        MatrixMulCUDA<32>
-            <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+        MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+    }
+    else {
+        MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
     }

     printf("done\n");

@@ -213,11 +207,10 @@ int MatrixMultiply(int argc, char **argv,

     for (int j = 0; j < nIter; j++) {
         if (block_size == 16) {
-            MatrixMulCUDA<16>
-                <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
-        } else {
-            MatrixMulCUDA<32>
-                <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+            MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
+        }
+        else {
+            MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
         }
     }

@@ -232,19 +225,18 @@ int MatrixMultiply(int argc, char **argv,

     // Compute and print the performance
     float msecPerMatrixMul = msecTotal / nIter;
-    double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
-                               static_cast<double>(dimsA.y) *
-                               static_cast<double>(dimsB.x);
-    double gigaFlops =
-        (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
-    printf(
-        "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
-        " WorkgroupSize= %u threads/block\n",
-        gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);
+    double flopsPerMatrixMul =
+        2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
+    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
+    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
+           " WorkgroupSize= %u threads/block\n",
+           gigaFlops,
+           msecPerMatrixMul,
+           flopsPerMatrixMul,
+           threads.x * threads.y);

     // Copy result from device to host
-    checkCudaErrors(
-        cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
+    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
     checkCudaErrors(cudaStreamSynchronize(stream));

     printf("Checking computed result for correctness: ");

@@ -261,8 +253,7 @@ int MatrixMultiply(int argc, char **argv,
         double rel_err = abs_err / abs_val / dot_length;

         if (rel_err > eps) {
-            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n",
-                   i, h_C[i], dimsA.x * valB, eps);
+            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
             correct = false;
         }
     }

@@ -278,13 +269,13 @@ int MatrixMultiply(int argc, char **argv,
     checkCudaErrors(cudaFree(d_C));
     checkCudaErrors(cudaEventDestroy(start));
     checkCudaErrors(cudaEventDestroy(stop));
-    printf(
-        "\nNOTE: The CUDA Samples are not meant for performance "
-        "measurements. Results may vary when GPU Boost is enabled.\n");
+    printf("\nNOTE: The CUDA Samples are not meant for performance "
+           "measurements. Results may vary when GPU Boost is enabled.\n");

     if (correct) {
         return EXIT_SUCCESS;
-    } else {
+    }
+    else {
         return EXIT_FAILURE;
     }
 }

@@ -293,15 +284,15 @@ int MatrixMultiply(int argc, char **argv,
 /**
 * Program main
 */
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("[Matrix Multiply Using CUDA] - Starting...\n");

-    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
-        checkCmdLineFlag(argc, (const char **)argv, "?")) {
+    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
         printf("Usage -device=n (n >= 0 for deviceID)\n");
         printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
         printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
-        printf("  Note: Outer matrix dimensions of A & B matrices" \
-               " must be equal.\n");
+        printf("  Note: Outer matrix dimensions of A & B matrices"
+               " must be equal.\n");

         exit(EXIT_SUCCESS);

@@ -337,13 +328,11 @@ int main(int argc, char **argv) {
     }

     if (dimsA.x != dimsB.y) {
-        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
-               dimsA.x, dimsB.y);
+        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
         exit(EXIT_FAILURE);
     }

-    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y,
-           dimsB.x, dimsB.y);
+    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

     checkCudaErrors(cudaProfilerStart());
     int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
|||||||
@ -46,23 +46,23 @@
|
|||||||
|
|
||||||
// includes, system
|
// includes, system
|
||||||
#include <builtin_types.h>
|
#include <builtin_types.h>
|
||||||
#include <math.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <iostream>
|
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <iostream>
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
// includes, project, CUDA
|
// includes, project, CUDA
|
||||||
|
#include <cstring>
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
#include <helper_cuda_drvapi.h>
|
#include <helper_cuda_drvapi.h>
|
||||||
#include <helper_image.h>
|
#include <helper_image.h>
|
||||||
#include <helper_string.h>
|
#include <helper_string.h>
|
||||||
#include <helper_timer.h>
|
#include <helper_timer.h>
|
||||||
|
|
||||||
#include <cstring>
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "matrixMul.h"
|
#include "matrixMul.h"
|
||||||
|
|
||||||
|
|
||||||
@ -71,11 +71,9 @@
|
|||||||
void runTest(int argc, char **argv);
|
void runTest(int argc, char **argv);
|
||||||
void randomInit(float *, int);
|
void randomInit(float *, int);
|
||||||
|
|
||||||
extern "C" void computeGold(float *, const float *, const float *, unsigned int,
|
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
|
||||||
unsigned int, unsigned int);
|
|
||||||
|
|
||||||
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
|
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size);
|
||||||
int *blk_size);
|
|
||||||
|
|
||||||
#ifndef FATBIN_FILE
|
#ifndef FATBIN_FILE
|
||||||
#define FATBIN_FILE "matrixMul_kernel64.fatbin"
|
#define FATBIN_FILE "matrixMul_kernel64.fatbin"
|
||||||
@ -91,7 +89,8 @@ size_t totalGlobalMem;
|
|||||||
|
|
||||||
const char *sSDKsample = "matrixMulDrv (Driver API)";
|
const char *sSDKsample = "matrixMulDrv (Driver API)";
|
||||||
|
|
||||||
void constantInit(float *data, int size, float val) {
|
void constantInit(float *data, int size, float val)
|
||||||
|
{
|
||||||
for (int i = 0; i < size; ++i) {
|
for (int i = 0; i < size; ++i) {
|
||||||
data[i] = val;
|
data[i] = val;
|
||||||
}
|
}
|
||||||
@ -100,7 +99,8 @@ void constantInit(float *data, int size, float val) {
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Program main
|
// Program main
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
printf("[ %s ]\n", sSDKsample);
|
printf("[ %s ]\n", sSDKsample);
|
||||||
|
|
||||||
runTest(argc, argv);
|
runTest(argc, argv);
|
||||||
@ -109,7 +109,8 @@ int main(int argc, char **argv) {
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
//! Run a simple test for CUDA
|
//! Run a simple test for CUDA
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
void runTest(int argc, char **argv) {
|
void runTest(int argc, char **argv)
|
||||||
|
{
|
||||||
// initialize CUDA
|
// initialize CUDA
|
||||||
CUfunction matrixMul = NULL;
|
CUfunction matrixMul = NULL;
|
||||||
int block_size = 0;
|
int block_size = 0;
|
||||||
@ -172,10 +173,19 @@ void runTest(int argc, char **argv) {
|
|||||||
size_t Matrix_Width_B = (size_t)WB;
|
size_t Matrix_Width_B = (size_t)WB;
|
||||||
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
|
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
|
||||||
// new CUDA 4.0 Driver API Kernel launch call
|
// new CUDA 4.0 Driver API Kernel launch call
|
||||||
checkCudaErrors(cuLaunchKernel(
|
checkCudaErrors(cuLaunchKernel(matrixMul,
|
||||||
matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
|
grid.x,
|
||||||
2 * block_size * block_size * sizeof(float), NULL, args, NULL));
|
grid.y,
|
||||||
} else {
|
grid.z,
|
||||||
|
block.x,
|
||||||
|
block.y,
|
||||||
|
block.z,
|
||||||
|
2 * block_size * block_size * sizeof(float),
|
||||||
|
NULL,
|
||||||
|
args,
|
||||||
|
NULL));
|
||||||
|
}
|
||||||
|
else {
|
||||||
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
|
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
|
||||||
// Launching (advanced method)
|
// Launching (advanced method)
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
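For context on the call reflowed above, here is a minimal sketch of the simple kernelParams launch path. Variable names such as func and module are illustrative assumptions, not part of this diff:

    // Sketch: Driver API launch via the kernelParams array (method 1).
    // Assumes a current context and that the fatbin was loaded into `module`.
    CUfunction func;
    checkCudaErrors(cuModuleGetFunction(&func, module, "matrixMul_bs16_64bit"));

    size_t widthA = WA, widthB = WB;
    void *args[5] = {&d_C, &d_A, &d_B, &widthA, &widthB}; // a pointer to each argument

    checkCudaErrors(cuLaunchKernel(func,
                                   grid.x, grid.y, grid.z,    // grid dimensions
                                   block.x, block.y, block.z, // block dimensions
                                   2 * block_size * block_size * sizeof(float), // dynamic shared memory
                                   NULL,    // default stream
                                   args,    // kernelParams
                                   NULL));  // "extra" is unused on this path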
@@ -198,14 +208,20 @@ void runTest(int argc, char **argv) {
         *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
         offset += sizeof(Matrix_Width_B);

-        void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
-                                         CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
-                                         CU_LAUNCH_PARAM_END};
+        void *kernel_launch_config[5] = {
+            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};

         // new CUDA 4.0 Driver API Kernel launch call
-        checkCudaErrors(cuLaunchKernel(
-            matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
-            2 * block_size * block_size * sizeof(float), NULL, NULL,
+        checkCudaErrors(cuLaunchKernel(matrixMul,
+                                       grid.x,
+                                       grid.y,
+                                       grid.z,
+                                       block.x,
+                                       block.y,
+                                       block.z,
+                                       2 * block_size * block_size * sizeof(float),
+                                       NULL,
+                                       NULL,
                                        reinterpret_cast<void **>(&kernel_launch_config)));
     }

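The alternative path in this hunk packs the same arguments into one byte buffer and hands it to cuLaunchKernel through the "extra" parameter. A sketch of the packing step, assuming the ALIGN_UP helper used elsewhere in the samples:

    // Sketch: CU_LAUNCH_PARAM_* launch (method 2). Each argument is copied
    // into argBuffer at its natural alignment; offset ends up as the total size.
    #define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)

    char argBuffer[256];
    size_t offset = 0;

    ALIGN_UP(offset, __alignof(d_C));
    *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
    offset += sizeof(d_C);
    // ... repeat for d_A, d_B, Matrix_Width_A, Matrix_Width_B ...

    void *kernel_launch_config[5] = {
        CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};
    checkCudaErrors(cuLaunchKernel(matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
                                   2 * block_size * block_size * sizeof(float),
                                   NULL, NULL, reinterpret_cast<void **>(&kernel_launch_config)));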
@@ -222,8 +238,7 @@ void runTest(int argc, char **argv) {

     for (int i = 0; i < static_cast<int>(WC * HC); i++) {
         if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
-            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i,
-                   h_C[i], WA * valB);
+            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * valB);
             correct = false;
         }
     }
@@ -244,14 +259,15 @@ void runTest(int argc, char **argv) {
 }

 // Allocates a matrix with random float entries.
-void randomInit(float *data, int size) {
+void randomInit(float *data, int size)
+{
     for (int i = 0; i < size; ++i) {
         data[i] = rand() / static_cast<float>(RAND_MAX);
     }
 }

-static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
-                    int *blk_size) {
+static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size)
+{
     CUfunction cuFunction = 0;
     int major = 0, minor = 0;
     char deviceName[100];
@@ -259,16 +275,13 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
     cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

     // get compute capabilities and the devicename
-    checkCudaErrors(cuDeviceGetAttribute(
-        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
-    checkCudaErrors(cuDeviceGetAttribute(
-        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
     checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
     printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

     checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
-    printf("  Total amount of global memory:     %llu bytes\n",
-           (long long unsigned int)totalGlobalMem);
+    printf("  Total amount of global memory:     %llu bytes\n", (long long unsigned int)totalGlobalMem);

     checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

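This hunk sits inside the standard Driver API bring-up. A condensed sketch of the sequence, spelling out the cuInit/cuDeviceGet steps that findCudaDeviceDRV wraps (device 0 is chosen here as an assumption):

    CUdevice cuDevice;
    CUcontext cuContext;
    int major = 0, minor = 0;
    size_t totalGlobalMem = 0;

    checkCudaErrors(cuInit(0));
    checkCudaErrors(cuDeviceGet(&cuDevice, 0)); // findCudaDeviceDRV wraps device selection
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); // context is current after creation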
@@ -278,7 +291,8 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,

     if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
         exit(EXIT_FAILURE);
-    } else {
+    }
+    else {
         printf("> initCUDA loading module: <%s>\n", module_path.c_str());
     }

@@ -291,8 +305,7 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
     checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

     // select the suitable kernel function
-    const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit",
-                             "matrixMul_bs8_64bit"};
+    const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit", "matrixMul_bs8_64bit"};

     int idx = 0;
     int block_size = 32;
@@ -302,12 +315,12 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,

         checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
         checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
-            &blocksPerGrid, &threadsPerBlock, cuFunction, 0,
-            2 * block_size * block_size * sizeof(float), 0));
+            &blocksPerGrid, &threadsPerBlock, cuFunction, 0, 2 * block_size * block_size * sizeof(float), 0));
         if (block_size * block_size <= threadsPerBlock) {
             printf("> %d block size selected\n", block_size);
             break;
-        } else {
+        }
+        else {
             block_size /= 2;
         }
         idx++;
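The selection loop above is worth spelling out: cuOccupancyMaxPotentialBlockSize proposes a thread count per block, and the sample halves its candidate tile (32, 16, 8) until block_size * block_size threads fit. A sketch of one iteration:

    // Sketch: occupancy-driven choice of the matrixMul tile size.
    int blocksPerGrid = 0, threadsPerBlock = 0;
    checkCudaErrors(cuOccupancyMaxPotentialBlockSize(&blocksPerGrid,
                                                     &threadsPerBlock,
                                                     cuFunction,
                                                     0, // no per-block-size shared-memory callback
                                                     2 * block_size * block_size * sizeof(float),
                                                     0)); // no upper block-size limit
    if (block_size * block_size <= threadsPerBlock) {
        printf("> %d block size selected\n", block_size);
    }
    else {
        block_size /= 2; // retry with the next smaller kernel variant
    }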
@@ -42,8 +42,8 @@
 //! wA is A's width and wB is B's width
 ////////////////////////////////////////////////////////////////////////////////
 template <int block_size, typename size_type>
-__device__ void matrixMul(float *C, float *A, float *B, size_type wA,
-                          size_type wB) {
+__device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB)
+{
     // Block index
     size_type bx = blockIdx.x;
     size_type by = blockIdx.y;
@@ -96,7 +96,8 @@ __device__ void matrixMul(float *C, float *A, float *B, size_type wA,
     // of the block sub-matrix
 #pragma unroll

-    for (size_type k = 0; k < block_size; ++k) Csub += AS(ty, k) * BS(k, tx);
+    for (size_type k = 0; k < block_size; ++k)
+        Csub += AS(ty, k) * BS(k, tx);

     // Synchronize to make sure that the preceding
     // computation is done before loading two new
@@ -111,16 +112,16 @@ __device__ void matrixMul(float *C, float *A, float *B, size_type wA,
 }

 // C wrappers around our template kernel
-extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B,
-                                               size_t wA, size_t wB) {
+extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
+{
     matrixMul<8, size_t>(C, A, B, wA, wB);
 }
-extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B,
-                                                size_t wA, size_t wB) {
+extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
+{
     matrixMul<16, size_t>(C, A, B, wA, wB);
 }
-extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B,
-                                                size_t wA, size_t wB) {
+extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
+{
     matrixMul<32, size_t>(C, A, B, wA, wB);
 }

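A note on the wrappers reformatted above: cuModuleGetFunction resolves an unmangled symbol name, so the templated matrixMul must be exposed through fixed extern "C" instantiations, one per block size. Host-side lookup then reads, for example:

    // Sketch: resolving one wrapper from the loaded module by its C name.
    CUfunction func;
    checkCudaErrors(cuModuleGetFunction(&func, cuModule, "matrixMul_bs16_64bit"));
    // matrixMul<16, size_t> itself is not directly addressable: its symbol
    // name would be C++-mangled and compiler dependent.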
@@ -20,9 +20,10 @@
 // #define CUDA_INIT_D3D11
 // #define CUDA_INIT_OPENGL

-#include <stdio.h>
 #include "cuda_drvapi_dynlink.h"

+#include <stdio.h>

 tcuInit *_cuInit;
 tcuDriverGetVersion *cuDriverGetVersion;
 tcuDeviceGet *cuDeviceGet;
@@ -239,8 +240,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
 {
     *pInstance = LoadLibrary(__CudaLibName);

-    if (*pInstance == NULL)
-    {
+    if (*pInstance == NULL) {
         printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
         return CUDA_ERROR_UNKNOWN;
     }
@@ -251,24 +251,21 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
 #define GET_PROC_EX(name, alias, required)                                               \
     alias = (t##name *)GetProcAddress(CudaDrvLib, #name);                                \
     if (alias == NULL && required) {                                                     \
-        printf("Failed to find required function \"%s\" in %s\n",                        \
-               #name, __CudaLibName);                                                    \
+        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
         return CUDA_ERROR_UNKNOWN;                                                       \
     }

 #define GET_PROC_EX_V2(name, alias, required)                                                           \
     alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));                                \
     if (alias == NULL && required) {                                                                    \
-        printf("Failed to find required function \"%s\" in %s\n",                                       \
-               STRINGIFY(name##_v2), __CudaLibName);                                                    \
+        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
         return CUDA_ERROR_UNKNOWN;                                                                      \
     }

 #define GET_PROC_EX_V3(name, alias, required)                                                           \
     alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));                                \
     if (alias == NULL && required) {                                                                    \
-        printf("Failed to find required function \"%s\" in %s\n",                                       \
-               STRINGIFY(name##_v3), __CudaLibName);                                                    \
+        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
         return CUDA_ERROR_UNKNOWN;                                                                      \
     }

@@ -294,8 +291,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
 {
     *pInstance = dlopen(__CudaLibName, RTLD_NOW);

-    if (*pInstance == NULL)
-    {
+    if (*pInstance == NULL) {
         printf("dlopen \"%s\" failed!\n", __CudaLibName);
         return CUDA_ERROR_UNKNOWN;
     }
@@ -306,24 +302,21 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
 #define GET_PROC_EX(name, alias, required)                                               \
     alias = (t##name *)dlsym(CudaDrvLib, #name);                                         \
     if (alias == NULL && required) {                                                     \
-        printf("Failed to find required function \"%s\" in %s\n",                        \
-               #name, __CudaLibName);                                                    \
+        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
         return CUDA_ERROR_UNKNOWN;                                                       \
     }

 #define GET_PROC_EX_V2(name, alias, required)                                                           \
     alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2));                                         \
     if (alias == NULL && required) {                                                                    \
-        printf("Failed to find required function \"%s\" in %s\n",                                       \
-               STRINGIFY(name##_v2), __CudaLibName);                                                    \
+        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
         return CUDA_ERROR_UNKNOWN;                                                                      \
     }

 #define GET_PROC_EX_V3(name, alias, required)                                                           \
     alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3));                                         \
     if (alias == NULL && required) {                                                                    \
-        printf("Failed to find required function \"%s\" in %s\n",                                       \
-               STRINGIFY(name##_v3), __CudaLibName);                                                    \
+        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
         return CUDA_ERROR_UNKNOWN;                                                                      \
     }

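The macros above all expand to the same resolve-and-check pattern. A minimal sketch of what one GET_PROC_EX_V2 expansion amounts to on the dlopen side (libcuda.so and cuCtxCreate are chosen as the example):

    // Sketch: runtime resolution of one Driver API entry point.
    typedef CUresult CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
    tcuCtxCreate *cuCtxCreate = NULL;

    void *CudaDrvLib = dlopen("libcuda.so", RTLD_NOW);
    if (CudaDrvLib == NULL) {
        return CUDA_ERROR_UNKNOWN;
    }

    // "_v2" suffix: the CUDA 3.2+ ABI with 64-bit-clean parameters.
    cuCtxCreate = (tcuCtxCreate *)dlsym(CudaDrvLib, "cuCtxCreate_v2");
    if (cuCtxCreate == NULL) {
        printf("Failed to find required function \"cuCtxCreate_v2\" in libcuda.so\n");
        return CUDA_ERROR_UNKNOWN;
    }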
@@ -359,8 +352,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
     // available since 2.2. if not present, version 1.0 is assumed
     GET_PROC_OPTIONAL(cuDriverGetVersion);

-    if (cuDriverGetVersion)
-    {
+    if (cuDriverGetVersion) {
         CHECKED_CALL(cuDriverGetVersion(&driverVer));
     }

@@ -428,24 +420,21 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
     GET_PROC(cuStreamDestroy);

     // These are CUDA 5.0 new functions
-    if (driverVer >= 5000)
-    {
+    if (driverVer >= 5000) {
         GET_PROC(cuMipmappedArrayCreate);
         GET_PROC(cuMipmappedArrayDestroy);
         GET_PROC(cuMipmappedArrayGetLevel);
     }

     // These are CUDA 4.2 new functions
-    if (driverVer >= 4020)
-    {
+    if (driverVer >= 4020) {
         GET_PROC(cuFuncSetSharedMemConfig);
         GET_PROC(cuCtxGetSharedMemConfig);
         GET_PROC(cuCtxSetSharedMemConfig);
     }

     // These are CUDA 4.1 new functions
-    if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
-    {
+    if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
         GET_PROC(cuDeviceGetByPCIBusId);
         GET_PROC(cuDeviceGetPCIBusId);
         GET_PROC(cuIpcGetEventHandle);
@@ -456,8 +445,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
     }

     // These could be _v2 interfaces
-    if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000)
-    {
+    if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) {
         GET_PROC_V2(cuCtxDestroy);
         GET_PROC_V2(cuCtxPopCurrent);
         GET_PROC_V2(cuCtxPushCurrent);
@@ -465,8 +453,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC_V2(cuEventDestroy);
     }

-    if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
-    {
+    if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
         GET_PROC_V2(cuDeviceTotalMem);
         GET_PROC_V2(cuCtxCreate);
         GET_PROC_V2(cuModuleGetGlobal);
@@ -507,17 +494,14 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC_V2(cuTexRefSetAddress);
         GET_PROC_V2(cuTexRefGetAddress);

-        if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
-        {
+        if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
             GET_PROC_V3(cuTexRefSetAddress2D);
         }
-        else
-        {
+        else {
             GET_PROC_V2(cuTexRefSetAddress2D);
         }
     }
-    else
-    {
+    else {
         // versions earlier than 3020
         GET_PROC(cuDeviceTotalMem);
         GET_PROC(cuCtxCreate);
@@ -562,8 +546,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
     }

     // The following functions are specific to CUDA versions
-    if (driverVer >= 4000)
-    {
+    if (driverVer >= 4000) {
         GET_PROC(cuCtxSetCurrent);
         GET_PROC(cuCtxGetCurrent);
         GET_PROC(cuMemHostRegister);
@@ -574,8 +557,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC(cuProfilerStop);
     }

-    if (driverVer >= 3010)
-    {
+    if (driverVer >= 3010) {
         GET_PROC(cuModuleGetSurfRef);
         GET_PROC(cuSurfRefSetArray);
         GET_PROC(cuSurfRefGetArray);
@@ -583,8 +565,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC(cuCtxGetLimit);
     }

-    if (driverVer >= 3000)
-    {
+    if (driverVer >= 3000) {
         GET_PROC(cuMemcpyDtoDAsync);
         GET_PROC(cuFuncSetCacheConfig);
 #ifdef CUDA_INIT_D3D11
@@ -595,12 +576,10 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC(cuGraphicsUnregisterResource);
         GET_PROC(cuGraphicsSubResourceGetMappedArray);

-        if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
-        {
+        if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
             GET_PROC_V2(cuGraphicsResourceGetMappedPointer);
         }
-        else
-        {
+        else {
             GET_PROC(cuGraphicsResourceGetMappedPointer);
         }

@@ -610,8 +589,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
         GET_PROC(cuGetExportTable);
     }

-    if (driverVer >= 2030)
-    {
+    if (driverVer >= 2030) {
         GET_PROC(cuMemHostGetFlags);
 #ifdef CUDA_INIT_D3D10
         GET_PROC(cuD3D10GetDevice);
@@ -624,8 +602,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
 #endif
     }

-    if (driverVer >= 2010)
-    {
+    if (driverVer >= 2010) {
         GET_PROC(cuModuleLoadDataEx);
         GET_PROC(cuModuleLoadFatBinary);
 #ifdef CUDA_INIT_OPENGL
@@ -43,7 +43,8 @@
 #define CUDA_VERSION 3020 /* 3.2 */

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 /**
@@ -81,8 +82,7 @@ typedef struct CUuuid_st /**< CUDA definition o
 /**
  * Context creation flags
  */
-typedef enum CUctx_flags_enum
-{
+typedef enum CUctx_flags_enum {
     CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */
     CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */
     CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */
@@ -103,8 +103,7 @@ typedef enum CUctx_flags_enum
 /**
  * Event creation flags
  */
-typedef enum CUevent_flags_enum
-{
+typedef enum CUevent_flags_enum {
     CU_EVENT_DEFAULT = 0, /**< Default event flag */
     CU_EVENT_BLOCKING_SYNC = 1, /**< Event uses blocking synchronization */
     CU_EVENT_DISABLE_TIMING = 2 /**< Event will not record timing data */
@@ -113,8 +112,7 @@ typedef enum CUevent_flags_enum
 /**
  * Array formats
  */
-typedef enum CUarray_format_enum
-{
+typedef enum CUarray_format_enum {
     CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
     CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
     CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
@@ -128,8 +126,7 @@ typedef enum CUarray_format_enum
 /**
  * Texture reference addressing modes
  */
-typedef enum CUaddress_mode_enum
-{
+typedef enum CUaddress_mode_enum {
     CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */
     CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */
     CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
@@ -139,8 +136,7 @@ typedef enum CUaddress_mode_enum
 /**
  * Texture reference filtering modes
  */
-typedef enum CUfilter_mode_enum
-{
+typedef enum CUfilter_mode_enum {
     CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */
     CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */
 } CUfilter_mode;
@@ -148,8 +144,7 @@ typedef enum CUfilter_mode_enum
 /**
  * Device properties
 */
-typedef enum CUdevice_attribute_enum
-{
+typedef enum CUdevice_attribute_enum {
     CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */
     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */
     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */
@@ -158,12 +153,15 @@ typedef enum CUdevice_attribute_enum
     CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */
     CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */
     CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */
-    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
-    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
+    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK =
+        8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY =
+        9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
     CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */
     CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */
     CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */
-    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK =
+        12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
     CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Peak clock frequency in kilohertz */
     CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */
     CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently */
@@ -190,7 +188,8 @@ typedef enum CUdevice_attribute_enum
     CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */
     CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76 /**< Minor compute capability version number */
 #if __CUDA_API_VERSION >= 4000
-    , CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */
+    ,
+    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */
     CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */
     CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */
     CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */
@@ -221,8 +220,7 @@ typedef struct CUdevprop_st
 /**
  * Function properties
  */
-typedef enum CUfunction_attribute_enum
-{
+typedef enum CUfunction_attribute_enum {
     /**
      * The maximum number of threads per block, beyond which a launch of the
     * function would fail. This number depends on both the function and the
@@ -277,8 +275,7 @@ typedef enum CUfunction_attribute_enum
 /**
  * Function cache configurations
  */
-typedef enum CUfunc_cache_enum
-{
+typedef enum CUfunc_cache_enum {
     CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */
     CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */
     CU_FUNC_CACHE_PREFER_L1 = 0x02 /**< prefer larger L1 cache and smaller shared memory */
@@ -287,8 +284,7 @@ typedef enum CUfunc_cache_enum
 /**
  * Shared memory configurations
  */
-typedef enum CUsharedconfig_enum
-{
+typedef enum CUsharedconfig_enum {
     CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */
     CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */
     CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */
@@ -297,33 +293,34 @@ typedef enum CUsharedconfig_enum
 /**
  * Memory types
  */
-typedef enum CUmemorytype_enum
-{
+typedef enum CUmemorytype_enum {
     CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */
     CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */
     CU_MEMORYTYPE_ARRAY = 0x03 /**< Array memory */
 #if __CUDA_API_VERSION >= 4000
-    , CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */
+    ,
+    CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */
 #endif
 } CUmemorytype;

 /**
  * Compute Modes
  */
-typedef enum CUcomputemode_enum
-{
+typedef enum CUcomputemode_enum {
     CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */
-    CU_COMPUTEMODE_PROHIBITED = 2 /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
+    CU_COMPUTEMODE_PROHIBITED =
+        2 /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
 #if __CUDA_API_VERSION >= 4000
-    , CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
+    ,
+    CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single
+                                            process can be present on this device at a time) */
 #endif
 } CUcomputemode;

 /**
  * Online compiler options
  */
-typedef enum CUjit_option_enum
-{
+typedef enum CUjit_option_enum {
     /**
      * Max number of registers that a thread may use.\n
      * Option type: unsigned int
@@ -414,8 +411,7 @@ typedef enum CUjit_option_enum
 /**
  * Online compilation targets
  */
-typedef enum CUjit_target_enum
-{
+typedef enum CUjit_target_enum {
     CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */
     CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */
     CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */
@@ -434,8 +430,7 @@ typedef enum CUjit_target_enum
 /**
  * Cubin matching fallback strategies
  */
-typedef enum CUjit_fallback_enum
-{
+typedef enum CUjit_fallback_enum {
     CU_PREFER_PTX = 0, /**< Prefer to compile ptx */
     CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code */
 } CUjit_fallback;
@@ -443,8 +438,7 @@ typedef enum CUjit_fallback_enum
 /**
  * Flags to register a graphics resource
  */
-typedef enum CUgraphicsRegisterFlags_enum
-{
+typedef enum CUgraphicsRegisterFlags_enum {
     CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00,
     CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01,
     CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02,
@@ -454,8 +448,7 @@ typedef enum CUgraphicsRegisterFlags_enum
 /**
  * Flags for mapping and unmapping interop resources
  */
-typedef enum CUgraphicsMapResourceFlags_enum
-{
+typedef enum CUgraphicsMapResourceFlags_enum {
     CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00,
     CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
     CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
@@ -464,8 +457,7 @@ typedef enum CUgraphicsMapResourceFlags_enum
 /**
  * Array indices for cube faces
  */
-typedef enum CUarray_cubemap_face_enum
-{
+typedef enum CUarray_cubemap_face_enum {
     CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */
     CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */
     CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */
@@ -477,8 +469,7 @@ typedef enum CUarray_cubemap_face_enum
 /**
  * Limits
  */
-typedef enum CUlimit_enum
-{
+typedef enum CUlimit_enum {
     CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */
     CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */
     CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 /**< GPU malloc heap size */
@@ -487,8 +478,7 @@ typedef enum CUlimit_enum
 /**
  * Resource types
 */
-typedef enum CUresourcetype_enum
-{
+typedef enum CUresourcetype_enum {
     CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */
     CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
     CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */
@@ -498,8 +488,7 @@ typedef enum CUresourcetype_enum
 /**
  * Error codes
  */
-typedef enum cudaError_enum
-{
+typedef enum cudaError_enum {
     /**
      * The API call returned with no errors. In the case of query calls, this
      * can also mean that the operation being queried is complete (see
@@ -1064,8 +1053,7 @@ typedef struct CUDA_TEXTURE_DESC_st
 /**
  * Resource view format
  */
-typedef enum CUresourceViewFormat_enum
-{
+typedef enum CUresourceViewFormat_enum {
     CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */
     CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */
     CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */
@@ -1130,7 +1118,6 @@ typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st
 #endif


-
 /**
  * If set, the CUDA array is a collection of layers, where each layer is either a 1D
  * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
@@ -1420,7 +1407,11 @@ typedef CUresult CUDAAPI tcuCtxSynchronize(void);

 typedef CUresult CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname);
 typedef CUresult CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image);
-typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule *module,
+                                             const void *image,
+                                             unsigned int numOptions,
+                                             CUjit_option *options,
+                                             void **optionValues);
 typedef CUresult CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
 typedef CUresult CUDAAPI tcuModuleUnload(CUmodule hmod);
 typedef CUresult CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
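The newly wrapped tcuModuleLoadDataEx carries JIT options as parallel arrays. A short usage sketch through a resolved pointer, with an illustrative option choice (ptxImage is a hypothetical in-memory PTX string):

    // Sketch: loading a PTX image with one JIT option. Option values are
    // passed by value, cast through void *.
    CUmodule module;
    CUjit_option options[1] = {CU_JIT_MAX_REGISTERS};
    void *optionValues[1] = {(void *)(uintptr_t)32}; // cap threads at 32 registers
    CHECKED_CALL(cuModuleLoadDataEx(&module, ptxImage, 1, options, optionValues));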
@@ -1449,8 +1440,7 @@ typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
                                           size_t Height,
                                           // size of biggest r/w to be performed by kernels on this memory
                                           // 4, 8 or 16 bytes
-                                          unsigned int ElementSizeBytes
-);
+                                          unsigned int ElementSizeBytes);
 #else
 typedef CUresult CUDAAPI tcuMemGetInfo(unsigned int *free, unsigned int *total);
 typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
@@ -1461,8 +1451,7 @@ typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
                                           unsigned int Height,
                                           // size of biggest r/w to be performed by kernels on this memory
                                           // 4, 8 or 16 bytes
-                                          unsigned int ElementSizeBytes
-);
+                                          unsigned int ElementSizeBytes);
 #endif

 typedef CUresult CUDAAPI tcuMemFree(CUdeviceptr dptr);
@@ -1495,9 +1484,9 @@ typedef struct CUipcMemHandle_st
     char reserved[CU_IPC_HANDLE_SIZE];
 } CUipcMemHandle;

-typedef enum CUipcMem_flags_enum
-{
-    CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */
+typedef enum CUipcMem_flags_enum {
+    CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS =
+        0x1 /**< Automatically enable peer access between remote devices as needed */
 } CUipcMem_flags;

 typedef CUresult CUDAAPI tcuDeviceGetByPCIBusId(CUdevice *dev, char *pciBusId);
@@ -1510,9 +1499,14 @@ typedef CUresult CUDAAPI tcuIpcCloseMemHandle(CUdeviceptr dptr);
 #endif

 typedef CUresult CUDAAPI tcuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
-typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);;
+typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);
+;
 typedef CUresult CUDAAPI tcuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice,
+                                       CUcontext dstContext,
+                                       CUdeviceptr srcDevice,
+                                       CUcontext srcContext,
+                                       size_t ByteCount);

 /************************************
 **
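tcuMemcpyPeer, added with the CUDA 4.0 interfaces, copies between allocations that live in different contexts, possibly on different devices. A hedged usage sketch (all names illustrative):

    // Sketch: copy `bytes` from an allocation owned by srcCtx to one owned by
    // dstCtx. The driver uses a direct peer-to-peer path when available and
    // otherwise stages the transfer through host memory.
    CUresult rc = cuMemcpyPeer(dstPtr, dstCtx, srcPtr, srcCtx, bytes);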
@@ -1541,7 +1535,8 @@ typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, size_t dstOffset, cons
 typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);

 // array <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult CUDAAPI
+tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
 #else
 // system <-> device memory
 typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
@@ -1551,15 +1546,28 @@ typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, un
 typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);

 // device <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
-typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray,
+                                       unsigned int dstOffset,
+                                       CUdeviceptr srcDevice,
+                                       unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice,
+                                       CUarray srcArray,
+                                       unsigned int srcOffset,
+                                       unsigned int ByteCount);

 // system <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray,
+                                       unsigned int dstOffset,
+                                       const void *srcHost,
+                                       unsigned int ByteCount);
 typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);

 // array <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray,
+                                       unsigned int dstOffset,
+                                       CUarray srcArray,
+                                       unsigned int srcOffset,
+                                       unsigned int ByteCount);
 #endif

 // 2D memcpy
@@ -1586,36 +1594,51 @@ typedef CUresult CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
 #if __CUDA_API_VERSION >= 3020
 // system <-> device memory
 typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
-                                            const void *srcHost, size_t ByteCount, CUstream hStream);
+                                            const void *srcHost,
+                                            size_t ByteCount,
+                                            CUstream hStream);
 typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
-                                            CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+                                            CUdeviceptr srcDevice,
+                                            size_t ByteCount,
+                                            CUstream hStream);

 // device <-> device memory
 typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
-                                            CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+                                            CUdeviceptr srcDevice,
+                                            size_t ByteCount,
+                                            CUstream hStream);

 // system <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
-                                            const void *srcHost, size_t ByteCount, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset,
-                                            size_t ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI
+tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI
+tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);

 #else
 // system <-> device memory
 typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
-                                            const void *srcHost, unsigned int ByteCount, CUstream hStream);
+                                            const void *srcHost,
+                                            unsigned int ByteCount,
+                                            CUstream hStream);
 typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
-                                            CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
+                                            CUdeviceptr srcDevice,
+                                            unsigned int ByteCount,
+                                            CUstream hStream);

 // device <-> device memory
 typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
-                                            CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
+                                            CUdeviceptr srcDevice,
+                                            unsigned int ByteCount,
+                                            CUstream hStream);

 // system <-> array memory
-typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset,
-                                            const void *srcHost, unsigned int ByteCount, CUstream hStream);
-typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset,
-                                            unsigned int ByteCount, CUstream hStream);
+typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray,
+                                            unsigned int dstOffset,
+                                            const void *srcHost,
+                                            unsigned int ByteCount,
+                                            CUstream hStream);
+typedef CUresult CUDAAPI
+tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
 #endif

 // 2D memcpy
@@ -1634,13 +1657,22 @@ typedef CUresult CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us,
 typedef CUresult CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);

 #if __CUDA_API_VERSION >= 3020
-typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, size_t Width, size_t Height);
-typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, size_t Width, size_t Height);
-typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, size_t Width, size_t Height);
+typedef CUresult CUDAAPI
+tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, size_t Width, size_t Height);
+typedef CUresult CUDAAPI
+tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, size_t Width, size_t Height);
+typedef CUresult CUDAAPI
+tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, size_t Width, size_t Height);
 #else
-typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
-typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
-typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
+typedef CUresult CUDAAPI
+tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
+typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice,
+                                        unsigned int dstPitch,
+                                        unsigned short us,
+                                        unsigned int Width,
+                                        unsigned int Height);
+typedef CUresult CUDAAPI
+tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
 #endif

 /************************************
@@ -1657,10 +1689,16 @@ typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache co
 typedef CUresult CUDAAPI tcuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);

 typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f,
-                                         unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
-                                         unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+                                         unsigned int gridDimX,
+                                         unsigned int gridDimY,
+                                         unsigned int gridDimZ,
+                                         unsigned int blockDimX,
+                                         unsigned int blockDimY,
+                                         unsigned int blockDimZ,
                                          unsigned int sharedMemBytes,
-                                         CUstream hStream, void **kernelParams, void **extra);
+                                         CUstream hStream,
+                                         void **kernelParams,
+                                         void **extra);

 /************************************
 **
@ -1676,8 +1714,12 @@ typedef CUresult CUDAAPI tcuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_
|
|||||||
typedef CUresult CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
|
typedef CUresult CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
|
||||||
|
|
||||||
#if __CUDA_API_VERSION >= 5000
|
#if __CUDA_API_VERSION >= 5000
|
||||||
typedef CUresult CUDAAPI tcuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels);
|
typedef CUresult CUDAAPI tcuMipmappedArrayCreate(CUmipmappedArray *pHandle,
|
||||||
typedef CUresult CUDAAPI tcuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
|
const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc,
|
||||||
|
unsigned int numMipmapLevels);
|
||||||
|
typedef CUresult CUDAAPI tcuMipmappedArrayGetLevel(CUarray *pLevelArray,
|
||||||
|
CUmipmappedArray hMipmappedArray,
|
||||||
|
unsigned int level);
|
||||||
typedef CUresult CUDAAPI tcuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
|
typedef CUresult CUDAAPI tcuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
@ -1694,10 +1736,19 @@ typedef CUresult CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, un
|
|||||||
|
|
||||||
#if __CUDA_API_VERSION >= 3020
|
#if __CUDA_API_VERSION >= 3020
|
||||||
typedef CUresult CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
|
typedef CUresult CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
|
||||||
typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
|
typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef,
|
||||||
|
const CUDA_ARRAY_DESCRIPTOR *desc,
|
||||||
|
CUdeviceptr dptr,
|
||||||
|
size_t Pitch);
|
||||||
#else
|
#else
|
||||||
typedef CUresult CUDAAPI tcuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes);
|
typedef CUresult CUDAAPI tcuTexRefSetAddress(unsigned int *ByteOffset,
|
||||||
typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
|
CUtexref hTexRef,
|
||||||
|
CUdeviceptr dptr,
|
||||||
|
unsigned int bytes);
|
||||||
|
typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef,
|
||||||
|
const CUDA_ARRAY_DESCRIPTOR *desc,
|
||||||
|
CUdeviceptr dptr,
|
||||||
|
unsigned int Pitch);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef CUresult CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
|
typedef CUresult CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
|
||||||
@ -1763,7 +1814,10 @@ typedef CUresult CUDAAPI tcuEventElapsedTime(float *pMilliseconds, CUevent hStar
|
|||||||
***********************************/
|
***********************************/
|
||||||
typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);
|
typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);
|
||||||
typedef CUresult CUDAAPI tcuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
|
typedef CUresult CUDAAPI tcuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
|
||||||
typedef CUresult CUDAAPI tcuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
|
typedef CUresult CUDAAPI tcuStreamAddCallback(CUstream hStream,
|
||||||
|
CUstreamCallback callback,
|
||||||
|
void *userData,
|
||||||
|
unsigned int flags);
|
||||||
|
|
||||||
typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
|
typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
|
||||||
typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
|
typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
|
||||||
@ -1775,17 +1829,28 @@ typedef CUresult CUDAAPI tcuStreamDestroy(CUstream hStream);
|
|||||||
**
|
**
|
||||||
***********************************/
|
***********************************/
|
||||||
typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
|
typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
|
||||||
typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
|
typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray,
|
||||||
|
CUgraphicsResource resource,
|
||||||
|
unsigned int arrayIndex,
|
||||||
|
unsigned int mipLevel);
|
||||||
|
|
||||||
#if __CUDA_API_VERSION >= 3020
|
#if __CUDA_API_VERSION >= 3020
|
||||||
typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
|
typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr,
|
||||||
|
size_t *pSize,
|
||||||
|
CUgraphicsResource resource);
|
||||||
#else
|
#else
|
||||||
typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
|
typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr,
|
||||||
|
unsigned int *pSize,
|
||||||
|
CUgraphicsResource resource);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
|
typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
|
||||||
typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
|
typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count,
|
||||||
typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
|
CUgraphicsResource *resources,
|
||||||
|
CUstream hStream);
|
||||||
|
typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count,
|
||||||
|
CUgraphicsResource *resources,
|
||||||
|
CUstream hStream);
|
||||||
|
|
||||||
/************************************
|
/************************************
|
||||||
**
|
**
|
||||||
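
Aside for reviewers: these `tcu*` typedefs only describe function-pointer shapes; the dynlink layer binds them to driver exports at runtime. A minimal sketch of that binding, assuming a POSIX loader and this header's typedefs (the helper and variable names below are illustrative, not part of this change):

    #include <dlfcn.h> // POSIX; the sample's loader wraps this (and LoadLibrary on Windows)

    static tcuLaunchKernel *pfn_cuLaunchKernel = 0;

    // Resolve the driver export whose shape the tcuLaunchKernel typedef models.
    static int bindLaunchKernel(void *libcuda)
    {
        pfn_cuLaunchKernel = (tcuLaunchKernel *)dlsym(libcuda, "cuLaunchKernel");
        return pfn_cuLaunchKernel != 0;
    }

Once bound, `pfn_cuLaunchKernel(...)` is called exactly like the runtime-linked `cuLaunchKernel`.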

@@ -14,21 +14,17 @@
 #ifndef HELPER_CUDA_DRVAPI_H
 #define HELPER_CUDA_DRVAPI_H
 
+#include <helper_string.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include <helper_string.h>
-
 #ifndef MAX
 #define MAX(a, b) (a > b ? a : b)
 #endif
 
 #ifndef HELPER_CUDA_DRVAPI_H
-inline int ftoi(float value) {
-  return (value >= 0 ? static_cast<int>(value + 0.5)
-                     : static_cast<int>(value - 0.5));
-}
+inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5) : static_cast<int>(value - 0.5)); }
 #endif
 
 #ifndef EXIT_WAIVED
@@ -47,39 +43,43 @@ inline int ftoi(float value) {
 #define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
 
 // These are the inline versions for all of the SDK helper functions
-inline void __checkCudaErrors(CUresult err, const char *file, const int line) {
+inline void __checkCudaErrors(CUresult err, const char *file, const int line)
+{
     if (CUDA_SUCCESS != err) {
         const char *errorStr = NULL;
         cuGetErrorString(err, &errorStr);
         fprintf(stderr,
                 "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
                 "line %i.\n",
-                err, errorStr, file, line);
+                err,
+                errorStr,
+                file,
+                line);
         exit(EXIT_FAILURE);
     }
 }
 #endif
 
 // This function wraps the CUDA Driver API into a template function
-template <class T>
-inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
-                             int device) {
+template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
+{
     checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
 }
 #endif
 
 // Beginning of GPU Architecture definitions
-inline int _ConvertSMVer2CoresDRV(int major, int minor) {
+inline int _ConvertSMVer2CoresDRV(int major, int minor)
+{
     // Defines for GPU Architecture types (using the SM version to determine the #
     // of cores per SM
-    typedef struct {
+    typedef struct
+    {
        int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM
                // minor version
        int Cores;
    } sSMtoCores;
 
-    sSMtoCores nGpuArchCoresPerSM[] = {
-        {0x30, 192},
+    sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192},
        {0x32, 192},
        {0x35, 192},
        {0x37, 192},
@@ -110,16 +110,18 @@ inline int _ConvertSMVer2CoresDRV(int major, int minor) {
 
    // If we don't find the values, we default use the previous one to run
    // properly
-    printf(
-        "MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
-        major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+    printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
+           major,
+           minor,
+           nGpuArchCoresPerSM[index - 1].Cores);
    return nGpuArchCoresPerSM[index - 1].Cores;
 }
 // end of GPU Architecture definitions
 
 #ifdef __cuda_cuda_h__
 // General GPU Device CUDA Initialization
-inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
+inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
+{
    int cuDevice = 0;
    int deviceCount = 0;
    checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
@@ -140,11 +142,8 @@ inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
 
    if (dev > deviceCount - 1) {
        fprintf(stderr, "\n");
-        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
-                deviceCount);
-        fprintf(stderr,
-                ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n",
-                dev);
+        fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
+        fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
        fprintf(stderr, "\n");
        return -dev;
    }
@@ -171,7 +170,8 @@ inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
 }
 
 // This function returns the best GPU based on performance
-inline int gpuGetMaxGflopsDeviceIdDRV() {
+inline int gpuGetMaxGflopsDeviceIdDRV()
+{
    CUdevice current_device = 0;
    CUdevice max_perf_device = 0;
    int device_count = 0;
@@ -187,8 +187,7 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
    checkCudaErrors(cuDeviceGetCount(&device_count));
 
    if (device_count == 0) {
-        fprintf(stderr,
-                "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
+        fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
        exit(EXIT_FAILURE);
    }
 
@@ -196,36 +195,31 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
    current_device = 0;
 
    while (current_device < device_count) {
-        checkCudaErrors(cuDeviceGetAttribute(
-            &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
-            current_device));
-        checkCudaErrors(cuDeviceGetAttribute(
-            &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
-        checkCudaErrors(cuDeviceGetAttribute(
-            &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
-        checkCudaErrors(cuDeviceGetAttribute(
-            &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
+        checkCudaErrors(
+            cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device));
+        checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
+        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
+        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
 
        int computeMode;
-        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,
-                              current_device);
+        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
 
        if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
            if (major == 9999 && minor == 9999) {
                sm_per_multiproc = 1;
-            } else {
+            }
+            else {
                sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
            }
 
-            unsigned long long compute_perf =
-                (unsigned long long)(multiProcessorCount * sm_per_multiproc *
-                                     clockRate);
+            unsigned long long compute_perf = (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);
 
            if (compute_perf > max_compute_perf) {
                max_compute_perf = compute_perf;
                max_perf_device = current_device;
            }
-        } else {
+        }
+        else {
            devices_prohibited++;
        }
 
@@ -243,7 +237,8 @@ inline int gpuGetMaxGflopsDeviceIdDRV() {
 }
 
 // General initialization call to pick the best CUDA Device
-inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
+inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
+{
    CUdevice cuDevice;
    int devID = 0;
 
@@ -255,7 +250,8 @@ inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
            printf("exiting...\n");
            exit(EXIT_SUCCESS);
        }
-    } else {
+    }
+    else {
        // Otherwise pick the device with highest Gflops/s
        char name[100];
        devID = gpuGetMaxGflopsDeviceIdDRV();
@@ -269,7 +265,8 @@ inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
    return cuDevice;
 }
 
-inline CUdevice findIntegratedGPUDrv() {
+inline CUdevice findIntegratedGPUDrv()
+{
    CUdevice current_device = 0;
    int device_count = 0;
    int devices_prohibited = 0;
@@ -286,28 +283,22 @@ inline CUdevice findIntegratedGPUDrv() {
    // Find the integrated GPU which is compute capable
    while (current_device < device_count) {
        int computeMode = -1;
-        checkCudaErrors(cuDeviceGetAttribute(
-            &isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
-        checkCudaErrors(cuDeviceGetAttribute(
-            &computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
+        checkCudaErrors(cuDeviceGetAttribute(&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
+        checkCudaErrors(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
 
        // If GPU is integrated and is not running on Compute Mode prohibited use
        // that
        if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
            int major = 0, minor = 0;
            char deviceName[256];
-            checkCudaErrors(cuDeviceGetAttribute(
-                &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                current_device));
-            checkCudaErrors(cuDeviceGetAttribute(
-                &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                current_device));
+            checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
+            checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
            checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
-            printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
-                   current_device, deviceName, major, minor);
+            printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device, deviceName, major, minor);
 
            return current_device;
-        } else {
+        }
+        else {
            devices_prohibited++;
        }
 
@@ -323,29 +314,26 @@ inline CUdevice findIntegratedGPUDrv() {
 }
 
 // General check for CUDA GPU SM Capabilities
-inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version,
-                                     int devID) {
+inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
+{
    CUdevice cuDevice;
    char name[256];
    int major = 0, minor = 0;
 
    checkCudaErrors(cuDeviceGet(&cuDevice, devID));
    checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
-    checkCudaErrors(cuDeviceGetAttribute(
-        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
-    checkCudaErrors(cuDeviceGetAttribute(
-        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
+    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
+    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
 
-    if ((major > major_version) ||
-        (major == major_version && minor >= minor_version)) {
-        printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name,
-               major, minor);
+    if ((major > major_version) || (major == major_version && minor >= minor_version)) {
+        printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
        return true;
-    } else {
-        printf(
-            "No GPU device was found that can support CUDA compute capability "
-            "%d.%d.\n",
-            major_version, minor_version);
+    }
+    else {
+        printf("No GPU device was found that can support CUDA compute capability "
+               "%d.%d.\n",
+               major_version,
+               minor_version);
        return false;
    }
 }
@@ -354,4 +342,3 @@ inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version,
 // end of CUDA Helper Functions
 
 #endif // HELPER_CUDA_DRVAPI_H
-
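
For context, the expected call order of the helpers in this header, as a hedged sketch only (the SM 3.0 threshold and the `main` shape below are illustrative; device initialization happens inside the selection helpers):

    int main(int argc, char **argv)
    {
        // Pick -device=n if given, otherwise the max-GFLOPS device.
        CUdevice dev = findCudaDeviceDRV(argc, (const char **)argv);

        if (!checkCudaCapabilitiesDRV(3, 0, (int)dev)) {
            return EXIT_WAIVED; // defined earlier in this header
        }

        int smCount = 0;
        getCudaAttribute<int>(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, (int)dev);
        printf("SMs: %d\n", smCount);
        return EXIT_SUCCESS;
    }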

@@ -43,10 +43,10 @@
 */
 
 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 // includes, CUDA
 #include "cuda_drvapi_dynlink.h"
@@ -78,8 +78,7 @@ static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
 ////////////////////////////////////////////////////////////////////////////////
 void randomInit(float *data, size_t size)
 {
-    for (size_t i = 0; i < size; ++i)
-    {
+    for (size_t i = 0; i < size; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
 }
@@ -100,18 +99,14 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
    checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
 
    // This assumes that the user is attempting to specify a explicit device -device=n
-    if (argc > 1)
-    {
+    if (argc > 1) {
        bool bFound = false;
 
-        for (int param=0; param < argc; param++)
-        {
-            if (!strncmp(argv[param], "-device", 7))
-            {
+        for (int param = 0; param < argc; param++) {
+            if (!strncmp(argv[param], "-device", 7)) {
                int i = (int)strlen(argv[1]);
 
-                while (argv[1][i] != '=')
-                {
+                while (argv[1][i] != '=') {
                    i--;
                }
 
@@ -128,16 +123,15 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
    int deviceCount = 0;
    checkCudaErrors(cuDeviceGetCount(&deviceCount));
 
-    if (deviceCount == 0)
-    {
+    if (deviceCount == 0) {
        fprintf(stderr, "No devices supporting CUDA detected, exiting...\n");
        exit(EXIT_SUCCESS);
    }
 
-    if (devID < 0) devID = 0;
+    if (devID < 0)
+        devID = 0;
 
-    if (devID > deviceCount -1)
-    {
+    if (devID > deviceCount - 1) {
        fprintf(stderr, "initCUDA (Device=%d) invalid GPU device. %d GPU device(s) detected.\n\n", devID, deviceCount);
        status = CUDA_ERROR_NOT_FOUND;
 
@@ -159,8 +153,7 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
    // create context for picked device
    status = cuCtxCreate(&g_cuContext, 0, cuDevice);
 
-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
        cuCtxDestroy(g_cuContext);
        exit(EXIT_SUCCESS);
    }
@@ -191,9 +184,11 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
    printf("> Compiling CUDA module\n");
 
 #if defined(_WIN64) || defined(__LP64__)
-    status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
+    status =
+        cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
 #else
-    status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
+    status =
+        cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
 #endif
 
    printf("> PTX JIT log:\n%s\n", jitLogBuffer);
@@ -203,19 +198,17 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
        delete[] jitLogBuffer;
    }
 
-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
        printf("Error while compiling PTX\n");
        cuCtxDestroy(g_cuContext);
        exit(EXIT_FAILURE);
    }
 
    // retrieve CUDA function from the compiled module
-    status = cuModuleGetFunction(&cuFunction, cuModule,
-                                 (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
+    status = cuModuleGetFunction(
+        &cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
 
-    if (CUDA_SUCCESS != status)
-    {
+    if (CUDA_SUCCESS != status) {
        cuCtxDestroy(g_cuContext);
        exit(EXIT_FAILURE);
    }
@@ -280,10 +273,8 @@ int main(int argc, char **argv)
    int Matrix_Width_B = WB;
    void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
 
-    checkCudaErrors(cuLaunchKernel(matrixMul, (WC/block_size), (HC/block_size), 1,
-                                   block_size , block_size , 1,
-                                   0,
-                                   NULL, args, NULL));
+    checkCudaErrors(cuLaunchKernel(
+        matrixMul, (WC / block_size), (HC / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
 }
 #else // __CUDA_API_VERSION <= 3020
 {
@@ -331,8 +322,7 @@ int main(int argc, char **argv)
    // check result
    float diff = 0.0f;
 
-    for (unsigned int i=0; i<size_C; i++)
-    {
+    for (unsigned int i = 0; i < size_C; i++) {
        float tmp = reference[i] - h_C[i];
        diff += tmp * tmp;
    }

@@ -28,8 +28,7 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 // export C interface
-extern "C"
-void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
+extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
 
 ////////////////////////////////////////////////////////////////////////////////
 //! Compute reference data set
@@ -40,16 +39,13 @@ void computeGold(float *, const float *, const float *, unsigned int, unsigned i
 //! @param hA         height of matrix A
 //! @param wB         width of matrix B
 ////////////////////////////////////////////////////////////////////////////////
-void
-computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
+void computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
 {
    for (unsigned int i = 0; i < hA; ++i)
-        for (unsigned int j = 0; j < wB; ++j)
-        {
+        for (unsigned int j = 0; j < wB; ++j) {
            double sum = 0;
 
-            for (unsigned int k = 0; k < wA; ++k)
-            {
+            for (unsigned int k = 0; k < wA; ++k) {
                double a = A[i * wA + k];
                double b = B[k * wB + j];
                sum += a * b;
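
As a usage sketch of the reference path above (names and the `epsilon` tolerance are assumptions for illustration, not taken from this diff), a GPU result can be validated against `computeGold` with a simple L2-norm check, which is the same squared-error accumulation the sample's own check performs:

    #include <cmath>
    #include <cstdlib>

    // Validate a GPU result h_C against the CPU reference; sizes follow the
    // doc comment above (hA x wA times wA x wB).
    bool verifyAgainstGold(const float *h_C, const float *h_A, const float *h_B,
                           unsigned int hA, unsigned int wA, unsigned int wB, float epsilon)
    {
        float *reference = (float *)malloc(hA * wB * sizeof(float));
        computeGold(reference, h_A, h_B, hA, wA, wB);

        float errSq = 0.0f;
        for (unsigned int i = 0; i < hA * wB; i++) {
            float tmp = reference[i] - h_C[i]; // accumulate squared error
            errSq += tmp * tmp;
        }
        free(reference);
        return sqrtf(errSq) < epsilon;
    }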

File diff suppressed because it is too large

@@ -32,7 +32,8 @@
 #define __matrixMul_kernel_32_ptxdump_h__
 
 #if defined __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 extern unsigned char matrixMul_kernel_32_ptxdump[25784];

File diff suppressed because it is too large

@@ -32,7 +32,8 @@
 #define __matrixMul_kernel_64_ptxdump_h__
 
 #if defined __cplusplus
-extern "C" {
+extern "C"
+{
 #endif
 
 extern unsigned char matrixMul_kernel_64_ptxdump[26489];

@@ -42,17 +42,19 @@
 */
 
 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>
 
 // CUDA runtime
 #include <cuda_runtime.h>
 
 #include "nvrtc_helper.h"
 
 // Helper functions and utilities to work with CUDA
 #include <helper_functions.h>
 
-void constantInit(float *data, int size, float val) {
+void constantInit(float *data, int size, float val)
+{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
@@ -61,8 +63,8 @@ void constantInit(float *data, int size, float val) {
 /**
  * Run a simple test of matrix multiplication using CUDA
  */
-int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
-                   dim3 &dimsB) {
+int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
+{
    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
@@ -114,24 +116,27 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
 
    CUfunction kernel_addr;
    if (block_size == 16) {
-        checkCudaErrors(
-            cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
-    } else {
-        checkCudaErrors(
-            cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
+        checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
+    }
+    else {
+        checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
    }
 
-    void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x,
-                   (void *)&dimsB.x};
+    void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x};
 
    // Execute the kernel
    int nIter = 300;
 
    for (int j = 0; j < nIter; j++) {
-        checkCudaErrors(
-            cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */
-                           threads.x, threads.y, threads.z,     /* block dim */
-                           0, 0,    /* shared mem, stream */
-                           &arr[0], /* arguments */
-                           0));
+        checkCudaErrors(cuLaunchKernel(kernel_addr,
+                                       grid.x,
+                                       grid.y,
+                                       grid.z, /* grid dim */
+                                       threads.x,
+                                       threads.y,
+                                       threads.z, /* block dim */
+                                       0,
+                                       0, /* shared mem, stream */
+                                       &arr[0], /* arguments */
+                                       0));
 
@@ -157,16 +162,14 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
    double rel_err = abs_err / abs_val / dot_length;
 
    if (rel_err > eps) {
-        printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
-               h_C[i], dimsA.x * valB, eps);
+        printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
        correct = false;
        }
    }
 
    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
 
-    printf(
-        "\nNOTE: The CUDA Samples are not meant for performance measurements. "
-        "Results may vary when GPU Boost is enabled.\n");
+    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
+           "Results may vary when GPU Boost is enabled.\n");
 
    // Clean up memory
@@ -180,7 +183,8 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
 
    if (correct) {
        return EXIT_SUCCESS;
-    } else {
+    }
+    else {
        return EXIT_FAILURE;
    }
 }
@@ -189,16 +193,15 @@ int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
 * Program main
 */
 
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
    printf("[Matrix Multiply Using CUDA] - Starting...\n");
 
-    if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
-        checkCmdLineFlag(argc, (const char **)argv, "?")) {
+    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
-        printf(
-            "  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
+        printf("  Note: Outer matrix dimensions of A & B matrices must be equal.\n");
 
        exit(EXIT_SUCCESS);
    }
@@ -234,13 +237,11 @@ int main(int argc, char **argv) {
    }
 
    if (dimsA.x != dimsB.y) {
-        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
-               dimsA.x, dimsB.y);
+        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }
 
-    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x,
-           dimsB.y);
+    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
 
    int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
 

@@ -48,11 +48,10 @@
 
 #include <cooperative_groups.h>
 
-template <int BLOCK_SIZE>
-__device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
+template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
+{
    // Handle to thread block group
-    cooperative_groups::thread_block cta =
-        cooperative_groups::this_thread_block();
+    cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;
@@ -120,12 +119,12 @@ __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
    C[c + wB * ty + tx] = Csub;
 }
 
-extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B,
-                                                 int wA, int wB) {
+extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
+{
    matrixMulCUDA<16>(C, A, B, wA, wB);
 }
 
-extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B,
-                                                 int wA, int wB) {
+extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
+{
    matrixMulCUDA<32>(C, A, B, wA, wB);
 }
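
A brief note on launch geometry for these kernels: each thread computes one element of C, so the grid is sized from the output matrix. A minimal sketch under that assumption (the helper name is illustrative; the sample's host code builds `grid` and `threads` inline):

    #include <cuda_runtime.h>

    // One thread per C element; assumes dimsB.x and dimsA.y are multiples of block_size.
    dim3 makeMatrixMulGrid(dim3 dimsA, dim3 dimsB, int block_size)
    {
        dim3 threads(block_size, block_size, 1);
        return dim3(dimsB.x / threads.x, dimsA.y / threads.y, 1);
    }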

@@ -28,12 +28,13 @@
 #include <cooperative_groups.h>
 
 namespace cg = cooperative_groups;
-#include <helper_cuda.h>
 #include <assert.h>
+#include <helper_cuda.h>
 
 #include "mergeSort_common.h"
 
-inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB,
-                                  uint &valB, uint arrowDir) {
+inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB, uint arrowDir)
+{
    uint t;
 
    if ((keyA > keyB) == arrowDir) {
@@ -46,9 +47,9 @@ inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB,
    }
 }
 
-__global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
-                                        uint *d_SrcKey, uint *d_SrcVal,
-                                        uint arrayLength, uint sortDir) {
+__global__ void
+bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint sortDir)
+{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    // Shared memory storage for one or more short vectors
@@ -62,10 +63,8 @@ __global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    s_key[threadIdx.x + 0] = d_SrcKey[0];
    s_val[threadIdx.x + 0] = d_SrcVal[0];
-    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
-        d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
-    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
-        d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
+    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
+    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
 
    for (uint size = 2; size < arrayLength; size <<= 1) {
        // Bitonic merge
@@ -74,8 +73,7 @@ __global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
        for (uint stride = size / 2; stride > 0; stride >>= 1) {
            cg::sync(cta);
            uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
-                       s_val[pos + stride], dir);
+            Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir);
        }
    }
 
@@ -84,26 +82,25 @@ __global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
    for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
        cg::sync(cta);
        uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-        Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
-                   s_val[pos + stride], sortDir);
+        Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], sortDir);
    }
    }
 
    cg::sync(cta);
    d_DstKey[0] = s_key[threadIdx.x + 0];
    d_DstVal[0] = s_val[threadIdx.x + 0];
-    d_DstKey[(SHARED_SIZE_LIMIT / 2)] =
-        s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
-    d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
-        s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
 }
 
 // Helper function (also used by odd-even merge sort)
-extern "C" uint factorRadix2(uint *log2L, uint L) {
+extern "C" uint factorRadix2(uint *log2L, uint L)
+{
    if (!L) {
        *log2L = 0;
        return 0;
-    } else {
+    }
+    else {
        for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
            ;
 
@@ -111,10 +108,14 @@ extern "C" uint factorRadix2(uint *log2L, uint L) {
    }
 }
 
-extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
-                                  uint *d_SrcKey, uint *d_SrcVal,
-                                  uint batchSize, uint arrayLength,
-                                  uint sortDir) {
+extern "C" void bitonicSortShared(uint *d_DstKey,
+                                  uint *d_DstVal,
+                                  uint *d_SrcKey,
+                                  uint *d_SrcVal,
+                                  uint batchSize,
+                                  uint arrayLength,
+                                  uint sortDir)
+{
    // Nothing to sort
    if (arrayLength < 2) {
        return;
@@ -131,32 +132,25 @@ extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
    assert(arrayLength <= SHARED_SIZE_LIMIT);
    assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);
 
-    bitonicSortSharedKernel<<<blockCount, threadCount>>>(
-        d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
+    bitonicSortSharedKernel<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
    getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // Merge step 3: merge elementary intervals
 ////////////////////////////////////////////////////////////////////////////////
-static inline __host__ __device__ uint iDivUp(uint a, uint b) {
-  return ((a % b) == 0) ? (a / b) : (a / b + 1);
-}
+static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }
 
-static inline __host__ __device__ uint getSampleCount(uint dividend) {
-  return iDivUp(dividend, SAMPLE_STRIDE);
-}
+static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
 
 template <uint sortDir>
-static inline __device__ void ComparatorExtended(uint &keyA, uint &valA,
-                                                 uint &flagA, uint &keyB,
-                                                 uint &valB, uint &flagB,
-                                                 uint arrowDir) {
+static inline __device__ void
+ComparatorExtended(uint &keyA, uint &valA, uint &flagA, uint &keyB, uint &valB, uint &flagB, uint arrowDir)
+{
    uint t;
 
-    if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) ||
-        ((arrowDir == sortDir) && (flagA == 1)) ||
-        ((arrowDir != sortDir) && (flagB == 1))) {
+    if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || ((arrowDir == sortDir) && (flagA == 1))
+        || ((arrowDir != sortDir) && (flagB == 1))) {
        t = keyA;
        keyA = keyB;
        keyB = t;
@@ -170,9 +164,15 @@ static inline __device__ void ComparatorExtended(uint &keyA, uint &valA,
 }
 
 template <uint sortDir>
-__global__ void bitonicMergeElementaryIntervalsKernel(
-    uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal,
-    uint *d_LimitsA, uint *d_LimitsB, uint stride, uint N) {
+__global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
+                                                      uint *d_DstVal,
+                                                      uint *d_SrcKey,
+                                                      uint *d_SrcVal,
+                                                      uint *d_LimitsA,
+                                                      uint *d_LimitsB,
+                                                      uint stride,
+                                                      uint N)
+{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[2 * SAMPLE_STRIDE];
@@ -200,10 +200,8 @@ __global__ void bitonicMergeElementaryIntervalsKernel(
        startSrcB = d_LimitsB[blockIdx.x];
        startDst = startSrcA + startSrcB;
 
-        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1]
-                                                        : segmentElementsA;
-        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
-                                                        : segmentElementsB;
+        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
+        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
        lenSrcA = endSrcA - startSrcA;
        lenSrcB = endSrcB - startSrcB;
    }
@@ -222,10 +220,8 @@ __global__ void bitonicMergeElementaryIntervalsKernel(
 
    // Prepare for bitonic merge by inversing the ordering
    if (threadIdx.x < lenSrcB) {
-        s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
-            d_SrcKey[stride + startSrcB + threadIdx.x];
-        s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
-            d_SrcVal[stride + startSrcB + threadIdx.x];
+        s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcKey[stride + startSrcB + threadIdx.x];
+        s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcVal[stride + startSrcB + threadIdx.x];
        s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
    }
 
@@ -233,9 +229,13 @@ __global__ void bitonicMergeElementaryIntervalsKernel(
    for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
        cg::sync(cta);
        uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-        ComparatorExtended<sortDir>(s_key[pos + 0], s_val[pos + 0], s_inf[pos + 0],
-                                    s_key[pos + stride], s_val[pos + stride],
-                                    s_inf[pos + stride], sortDir);
+        ComparatorExtended<sortDir>(s_key[pos + 0],
+                                    s_val[pos + 0],
+                                    s_inf[pos + 0],
+                                    s_key[pos + stride],
+                                    s_val[pos + stride],
+                                    s_inf[pos + stride],
+                                    sortDir);
    }
 
    // Store sorted data
@@ -254,26 +254,28 @@ __global__ void bitonicMergeElementaryIntervalsKernel(
    }
 }
 
-extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
-                                                uint *d_SrcKey, uint *d_SrcVal,
-                                                uint *d_LimitsA,
-                                                uint *d_LimitsB, uint stride,
-                                                uint N, uint sortDir) {
+extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
+                                                uint *d_DstVal,
+                                                uint *d_SrcKey,
+                                                uint *d_SrcVal,
+                                                uint *d_LimitsA,
+                                                uint *d_LimitsB,
+                                                uint stride,
+                                                uint N,
+                                                uint sortDir)
+{
    uint lastSegmentElements = N % (2 * stride);
 
-    uint mergePairs = (lastSegmentElements > stride)
-                          ? getSampleCount(N)
-                          : (N - lastSegmentElements) / SAMPLE_STRIDE;
+    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
 
    if (sortDir) {
-        bitonicMergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
-            d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
-            N);
+        bitonicMergeElementaryIntervalsKernel<1U>
+            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
-    } else {
-        bitonicMergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
-            d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
-            N);
+    }
+    else {
+        bitonicMergeElementaryIntervalsKernel<0U>
+            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
    }
 }
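
To make the `Comparator` exchange rule above concrete, here is a host-side, keys-only analogue as a self-contained sketch (not part of this change): with `arrowDir == 1` the pair ends up ascending, with `arrowDir == 0` descending.

    #include <cstdio>

    static void comparatorHost(unsigned &keyA, unsigned &keyB, unsigned arrowDir)
    {
        if ((keyA > keyB) == arrowDir) {
            unsigned t = keyA; // swap into the requested direction
            keyA = keyB;
            keyB = t;
        }
    }

    int main()
    {
        unsigned a = 5, b = 3;
        comparatorHost(a, b, 1);
        std::printf("%u %u\n", a, b); // prints "3 5"
        return 0;
    }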
|
|||||||
@ -26,17 +26,19 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <helper_cuda.h>
|
||||||
|
#include <helper_functions.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <cuda_runtime.h>
|
|
||||||
#include <helper_functions.h>
|
|
||||||
#include <helper_cuda.h>
|
|
||||||
#include "mergeSort_common.h"
|
#include "mergeSort_common.h"
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Test driver
|
// Test driver
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
|
uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
|
||||||
uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
|
uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
|
||||||
StopWatchInterface *hTimer = NULL;
|
StopWatchInterface *hTimer = NULL;
|
||||||
@@ -75,10 +77,8 @@ int main(int argc, char **argv) {
    checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
    checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));

    printf("Initializing GPU merge sort...\n");
    initMergeSort();
@@ -93,10 +93,8 @@ int main(int argc, char **argv) {
    printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));

    printf("Reading back GPU merge sort results...\n");
    checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));

    printf("Inspecting the results...\n");
    uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);

@@ -39,21 +39,19 @@
namespace cg = cooperative_groups;

#include <helper_cuda.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }

static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }

#define W (sizeof(uint) * 8)
static inline __device__ uint nextPowerOfTwo(uint x)
{
    /*
    --x;
    x |= x >> 1;
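For orientation: `iDivUp` is a ceiling division, and `getSampleCount` counts how many SAMPLE_STRIDE-wide samples are needed to cover `dividend` elements. A small host-side sketch; the value 128 for SAMPLE_STRIDE is an assumption here (it is defined in mergeSort_common.h, which is not shown in this diff):

    #include <cassert>

    typedef unsigned int uint;

    static const uint SAMPLE_STRIDE = 128; // assumed; set in mergeSort_common.h

    static uint iDivUp(uint a, uint b) { return (a % b == 0) ? (a / b) : (a / b + 1); }
    static uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }

    int main()
    {
        assert(iDivUp(1024, 128) == 8);    // exact multiple: no rounding
        assert(iDivUp(1025, 128) == 9);    // one extra, partially filled sample
        assert(getSampleCount(1000) == 8); // 7 full samples plus a partial one
        return 0;
    }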
@@ -66,9 +64,8 @@ static inline __device__ uint nextPowerOfTwo(uint x) {
    return 1U << (W - __clz(x - 1));
}

template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }
@@ -78,8 +75,7 @@ static inline __device__ uint binarySearchInclusive(uint val, uint *data,
    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }
@@ -87,9 +83,8 @@ static inline __device__ uint binarySearchInclusive(uint val, uint *data,
    return pos;
}

template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }
@@ -99,8 +94,7 @@ static inline __device__ uint binarySearchExclusive(uint val, uint *data,
    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }
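The two searches differ only in the comparison: for ascending order the inclusive variant counts elements less than or equal to `val`, the exclusive variant counts elements strictly less than `val`. Using one for each source array breaks ties consistently, which is what keeps the merge stable. A host-side, linear-scan equivalent of the same ranks (the kernels use the strided binary search above to get them in O(log L) steps; ascending order assumed):

    typedef unsigned int uint;

    // Number of elements in sorted data[0..L) that are <= val (inclusive rank).
    static uint rankInclusive(const uint *data, uint L, uint val)
    {
        uint pos = 0;
        while (pos < L && data[pos] <= val)
            pos++;
        return pos;
    }

    // Number of elements in sorted data[0..L) that are < val (exclusive rank).
    static uint rankExclusive(const uint *data, uint L, uint val)
    {
        uint pos = 0;
        while (pos < L && data[pos] < val)
            pos++;
        return pos;
    }

    int main()
    {
        uint data[] = {1, 3, 3, 7};
        // val = 3: inclusive rank is 3 (counts both 3s), exclusive rank is 1.
        return !(rankInclusive(data, 4, 3) == 3 && rankExclusive(data, 4, 3) == 1);
    }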
@@ -112,9 +106,8 @@ static inline __device__ uint binarySearchExclusive(uint val, uint *data,
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[SHARED_SIZE_LIMIT];
@@ -126,10 +119,8 @@ __global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    s_key[threadIdx.x + 0] = d_SrcKey[0];
    s_val[threadIdx.x + 0] = d_SrcVal[0];
    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];

    for (uint stride = 1; stride < arrayLength; stride <<= 1) {
        uint lPos = threadIdx.x & (stride - 1);
@@ -141,12 +132,8 @@ __global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
        uint valA = baseVal[lPos + 0];
        uint keyB = baseKey[lPos + stride];
        uint valB = baseVal[lPos + stride];
        uint posA = binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + lPos;
        uint posB = binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + lPos;

        cg::sync(cta);
        baseKey[posA] = keyA;
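The positions computed here are the classic merge-by-rank identity: an element's slot in the merged output is its index in its own run plus its rank in the other run. A tiny worked trace (ascending order assumed), using the rank helpers sketched earlier:

    // A = {1, 3}, B = {2, 3}; merged = {1, 2, 3, 3}
    // A[0] = 1: index 0 + rankExclusive(B, 1) = 0 + 0 -> output slot 0
    // A[1] = 3: index 1 + rankExclusive(B, 3) = 1 + 1 -> output slot 2
    // B[0] = 2: index 0 + rankInclusive(A, 2) = 0 + 1 -> output slot 1
    // B[1] = 3: index 1 + rankInclusive(A, 3) = 1 + 2 -> output slot 3
    // Exclusive-vs-inclusive tie-breaking makes the two 3s land in distinct
    // slots (2 and 3) with A's copy first, i.e. the merge is stable and every
    // thread can scatter its element without conflicting writes.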
@@ -158,15 +145,18 @@ __global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
    cg::sync(cta);
    d_DstKey[0] = s_key[threadIdx.x + 0];
    d_DstVal[0] = s_val[threadIdx.x + 0];
    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}

static void mergeSortShared(uint *d_DstKey,
                            uint *d_DstVal,
                            uint *d_SrcKey,
                            uint *d_SrcVal,
                            uint batchSize,
                            uint arrayLength,
                            uint sortDir)
{
    if (arrayLength < 2) {
        return;
    }
@@ -177,12 +167,11 @@ static void mergeSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey,
    uint threadCount = SHARED_SIZE_LIMIT / 2;

    if (sortDir) {
        mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<1><<<>>> failed\n");
    }
    else {
        mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<0><<<>>> failed\n");
    }
}
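Each block of this bottom-level pass sorts one SHARED_SIZE_LIMIT-element tile entirely in shared memory, with every thread handling two elements; hence threadCount = SHARED_SIZE_LIMIT / 2. A small arithmetic sketch, assuming SHARED_SIZE_LIMIT = 1024 (its value in mergeSort_common.h, not shown in this diff) and a batch covering N = batchSize * arrayLength keys:

    typedef unsigned int uint;

    static const uint SHARED_SIZE_LIMIT = 1024U; // assumed; set in mergeSort_common.h

    // One block per 1024-element tile, 512 threads per block
    // (each thread loads, merges, and stores two elements).
    int main()
    {
        uint N = 1U << 20;                        // 1,048,576 keys
        uint blockCount  = N / SHARED_SIZE_LIMIT; // 1024 blocks
        uint threadCount = SHARED_SIZE_LIMIT / 2; // 512 threads per block
        return !(blockCount == 1024 && threadCount == 512);
    }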
@@ -191,9 +180,9 @@ static void mergeSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey,
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void
generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (pos >= threadCount) {
@@ -214,33 +203,30 @@ __global__ void generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB,
    if (i < segmentSamplesA) {
        d_RanksA[i] = i * SAMPLE_STRIDE;
        d_RanksB[i] = binarySearchExclusive<sortDir>(
            d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, nextPowerOfTwo(segmentElementsB));
    }

    if (i < segmentSamplesB) {
        d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
        d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
            d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, nextPowerOfTwo(segmentElementsA));
    }
}

static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    if (sortDir) {
        generateSampleRanksKernel<1U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
    }
    else {
        generateSampleRanksKernel<0U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
    }
}
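The threadCount expression assigns one thread per SAMPLE_STRIDE-wide sample of each segment pair: when the trailing segment is longer than `stride` it is rounded up to a full 2*stride pair, otherwise it is excluded here and copied through unchanged later. A worked instance of both branches, again assuming SAMPLE_STRIDE = 128:

    typedef unsigned int uint;

    static const uint SAMPLE_STRIDE = 128; // assumed

    int main()
    {
        // Tail exactly equal to stride: not merged this stage, so excluded.
        uint N = 6144, stride = 2048;
        uint last = N % (2 * stride); // 6144 % 4096 = 2048 == stride
        uint threadCount = (last > stride) ? (N + 2 * stride - last) / (2 * SAMPLE_STRIDE)
                                           : (N - last) / (2 * SAMPLE_STRIDE); // 4096/256 = 16

        // Tail longer than stride: padded up to a full pair and merged.
        uint N2 = 7000;
        uint last2 = N2 % (2 * stride); // 2904 > stride
        uint threadCount2 = (N2 + 2 * stride - last2) / (2 * SAMPLE_STRIDE); // 8192/256 = 32

        return !(threadCount == 16 && threadCount2 == 32);
    }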
@@ -248,9 +234,8 @@ static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey,
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (pos >= threadCount) {
@@ -269,36 +254,29 @@ __global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks,

    if (i < segmentSamplesA) {
        uint dstPos = binarySearchExclusive<1U>(
                          d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB))
                      + i;
        d_Limits[dstPos] = d_Ranks[i];
    }

    if (i < segmentSamplesB) {
        uint dstPos = binarySearchInclusive<1U>(
                          d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA))
                      + i;
        d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
    }
}

static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, uint *d_RanksA, uint *d_RanksB, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsA, d_RanksA, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");

    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsB, d_RanksB, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
}
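Steps 1 and 2 together chop each pair of sorted segments into "elementary intervals" whose size is bounded by construction, which is what lets step 3 below merge every interval with a single SAMPLE_STRIDE-thread block in shared memory. The counting argument, sketched in comments:

    // Why an elementary interval holds at most SAMPLE_STRIDE elements per side:
    // samples are taken every SAMPLE_STRIDE positions of segment A and of
    // segment B, and step 2 merge-sorts the union of the two sample sets.
    // Between two adjacent boundaries of that merged sample list there is no
    // A-sample and no B-sample strictly inside, so fewer than SAMPLE_STRIDE
    // elements of A and fewer than SAMPLE_STRIDE elements of B can lie there.
    // Hence the <<<mergePairs, SAMPLE_STRIDE>>> launch below, with a
    // 2 * SAMPLE_STRIDE shared-memory staging buffer, always has room.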
@@ -306,24 +284,30 @@ static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB,
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
inline __device__ void merge(uint *dstKey,
                             uint *dstVal,
                             uint *srcAKey,
                             uint *srcAVal,
                             uint *srcBKey,
                             uint *srcBVal,
                             uint lenA,
                             uint nPowTwoLenA,
                             uint lenB,
                             uint nPowTwoLenB,
                             cg::thread_block cta)
{
    uint keyA, valA, keyB, valB, dstPosA, dstPosB;

    if (threadIdx.x < lenA) {
        keyA = srcAKey[threadIdx.x];
        valA = srcAVal[threadIdx.x];
        dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + threadIdx.x;
    }

    if (threadIdx.x < lenB) {
        keyB = srcBKey[threadIdx.x];
        valB = srcBVal[threadIdx.x];
        dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + threadIdx.x;
    }

    cg::sync(cta);
@@ -340,10 +324,15 @@ inline __device__ void merge(uint *dstKey, uint *dstVal, uint *srcAKey,
}

template <uint sortDir>
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,
                                               uint *d_DstVal,
                                               uint *d_SrcKey,
                                               uint *d_SrcVal,
                                               uint *d_LimitsA,
                                               uint *d_LimitsB,
                                               uint stride,
                                               uint N)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[2 * SAMPLE_STRIDE];
@@ -368,10 +357,8 @@ __global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal,

    startSrcA = d_LimitsA[blockIdx.x];
    startSrcB = d_LimitsB[blockIdx.x];
    uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
    uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
    lenSrcA = endSrcA - startSrcA;
    lenSrcB = endSrcB - startSrcB;
    startDstA = startSrcA + startSrcB;
@@ -387,17 +374,23 @@ __global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal,
    }

    if (threadIdx.x < lenSrcB) {
        s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x];
        s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x];
    }

    // Merge data in shared memory
    cg::sync(cta);
    merge<sortDir>(s_key,
                   s_val,
                   s_key + 0,
                   s_val + 0,
                   s_key + SAMPLE_STRIDE,
                   s_val + SAMPLE_STRIDE,
                   lenSrcA,
                   SAMPLE_STRIDE,
                   lenSrcB,
                   SAMPLE_STRIDE,
                   cta);

    // Store merged data
    cg::sync(cta);
@@ -413,63 +406,77 @@ __global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal,
    }
}

static void mergeElementaryIntervals(uint *d_DstKey,
                                     uint *d_DstVal,
                                     uint *d_SrcKey,
                                     uint *d_SrcVal,
                                     uint *d_LimitsA,
                                     uint *d_LimitsB,
                                     uint stride,
                                     uint N,
                                     uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;

    if (sortDir) {
        mergeElementaryIntervalsKernel<1U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
    }
    else {
        mergeElementaryIntervalsKernel<0U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
    }
}

extern "C" void bitonicSortShared(uint *d_DstKey,
                                  uint *d_DstVal,
                                  uint *d_SrcKey,
                                  uint *d_SrcVal,
                                  uint batchSize,
                                  uint arrayLength,
                                  uint sortDir);

extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
                                                uint *d_DstVal,
                                                uint *d_SrcKey,
                                                uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB,
                                                uint stride,
                                                uint N,
                                                uint sortDir);

static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
static const uint MAX_SAMPLE_COUNT = 32768;

extern "C" void initMergeSort(void)
{
    checkCudaErrors(cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
}

extern "C" void closeMergeSort(void)
{
    checkCudaErrors(cudaFree(d_RanksA));
    checkCudaErrors(cudaFree(d_RanksB));
    checkCudaErrors(cudaFree(d_LimitsB));
    checkCudaErrors(cudaFree(d_LimitsA));
}

extern "C" void mergeSort(uint *d_DstKey,
                          uint *d_DstVal,
                          uint *d_BufKey,
                          uint *d_BufVal,
                          uint *d_SrcKey,
                          uint *d_SrcVal,
                          uint N,
                          uint sortDir)
{
    uint stageCount = 0;

    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
@@ -482,7 +489,8 @@ extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,
        ival = d_BufVal;
        okey = d_DstKey;
        oval = d_DstVal;
    }
    else {
        ikey = d_DstKey;
        ival = d_DstVal;
        okey = d_BufKey;
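The initial buffer assignment above is decided by the parity of `stageCount`, counted in the first loop of mergeSort: every merge stage ping-pongs between the destination and buffer arrays, so the driver picks the starting side that leaves the final stage writing into d_DstKey/d_DstVal. The argument in outline (a sketch of the reasoning, not the sample's literal code):

    // Parity argument: after k buffer swaps, the data sits in the starting
    // array iff k is even. stageCount is the number of merge stages, i.e. how
    // many doublings of stride it takes to grow SHARED_SIZE_LIMIT-sized runs
    // to cover N. So:
    //   stageCount even -> start in dst (an even number of swaps returns there)
    //   stageCount odd  -> start in buf (the odd number of swaps lands in dst)
    // This is exactly what testing the low bit of stageCount selects.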
@@ -491,8 +499,7 @@ extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,

    assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
    assert(N % SHARED_SIZE_LIMIT == 0);
    mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, SHARED_SIZE_LIMIT, sortDir);

    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
        uint lastSegmentElements = N % (2 * stride);
@@ -504,18 +511,19 @@ extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,
        mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);

        // Merge elementary intervals
        mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, stride, N, sortDir);

        if (lastSegmentElements <= stride) {
            // Last merge segment consists of a single array which just needs to be
            // passed through
            checkCudaErrors(cudaMemcpy(okey + (N - lastSegmentElements),
                                       ikey + (N - lastSegmentElements),
                                       lastSegmentElements * sizeof(uint),
                                       cudaMemcpyDeviceToDevice));
            checkCudaErrors(cudaMemcpy(oval + (N - lastSegmentElements),
                                       ival + (N - lastSegmentElements),
                                       lastSegmentElements * sizeof(uint),
                                       cudaMemcpyDeviceToDevice));
        }

        uint *t;

@@ -36,14 +36,12 @@ typedef unsigned int uint;
////////////////////////////////////////////////////////////////////////////////
// Extensive sort validation routine
////////////////////////////////////////////////////////////////////////////////
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir);

extern "C" void fillValues(uint *val, uint N);

extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength);

////////////////////////////////////////////////////////////////////////////////
// CUDA merge sort
@@ -52,13 +50,11 @@ extern "C" void initMergeSort(void);

extern "C" void closeMergeSort(void);

extern "C" void
mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

////////////////////////////////////////////////////////////////////////////////
// CPU "emulation"
////////////////////////////////////////////////////////////////////////////////
extern "C" void
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

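The header above is the sample's whole public surface. A hedged end-to-end sketch of how a driver might call it (the wrapper function name and sizes are illustrative, and error checking is trimmed; it is not the sample's literal main.cpp):

    // Illustrative driver for the extern "C" API declared above:
    // sort N key/value pairs ascending on the GPU.
    #include <cuda_runtime.h>

    typedef unsigned int uint;

    extern "C" void initMergeSort(void);
    extern "C" void closeMergeSort(void);
    extern "C" void
    mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

    void sortOnDevice(uint *d_SrcKey, uint *d_SrcVal, uint N)
    {
        // N must be a multiple of SHARED_SIZE_LIMIT and no larger than
        // SAMPLE_STRIDE * MAX_SAMPLE_COUNT (both asserted inside mergeSort).
        uint *d_DstKey, *d_DstVal, *d_BufKey, *d_BufVal;
        cudaMalloc((void **)&d_DstKey, N * sizeof(uint));
        cudaMalloc((void **)&d_DstVal, N * sizeof(uint));
        cudaMalloc((void **)&d_BufKey, N * sizeof(uint));
        cudaMalloc((void **)&d_BufVal, N * sizeof(uint));

        initMergeSort(); // allocates the rank/limit scratch arrays
        mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, 1 /* ascending */);
        cudaDeviceSynchronize();
        closeMergeSort();

        cudaFree(d_BufVal);
        cudaFree(d_BufKey);
        cudaFree(d_DstVal);
        cudaFree(d_DstKey);
    }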
@@ -29,19 +29,20 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static void checkOrder(uint *data, uint N, uint sortDir)
{
    if (N <= 1) {
        return;
    }

    for (uint i = 0; i < N - 1; i++)
        if ((sortDir && (data[i] > data[i + 1])) || (!sortDir && (data[i] < data[i + 1]))) {
            fprintf(stderr, "checkOrder() failed!!!\n");
            exit(EXIT_FAILURE);
        }
@@ -49,12 +50,13 @@ static void checkOrder(uint *data, uint N, uint sortDir) {

static uint umin(uint a, uint b) { return (a <= b) ? a : b; }

static uint getSampleCount(uint dividend)
{
    return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1) : (dividend / SAMPLE_STRIDE);
}

static uint nextPowerOfTwo(uint x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
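This host-side `nextPowerOfTwo` uses the classic bit-smearing trick, whereas the device version seen earlier counts leading zeros with `__clz` instead. A worked trace:

    // Bit-smearing next-power-of-two, traced for x = 300 (0b1'0010'1100):
    //   --x                        -> 299 = 0b1'0010'1011
    //   x |= x >> 1 ... x >> 16    -> every bit below the top bit set:
    //                                 0b1'1111'1111 = 511
    //   ++x                        -> 512, the next power of two
    // Decrementing first makes exact powers of two map to themselves,
    // e.g. 256 -> 255 -> 255 (smeared) -> 256.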
@@ -64,7 +66,8 @@ static uint nextPowerOfTwo(uint x) {
    return ++x;
}

static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }
@@ -74,8 +77,7 @@ static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir) {
    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }
@@ -83,7 +85,8 @@ static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir) {
    return pos;
}

static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }
@@ -93,8 +96,7 @@ static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir) {
    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }
@@ -105,12 +107,10 @@ static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir) {
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: find sample ranks in each segment
////////////////////////////////////////////////////////////////////////////////
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    for (uint pos = 0; pos < sampleCount; pos++) {
|
|||||||
|
|
||||||
if (i < nA) {
|
if (i < nA) {
|
||||||
ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
|
ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
|
||||||
ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] =
|
ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = binarySearchExclusive(
|
||||||
binarySearchExclusive(srcKey[segmentBase + i * SAMPLE_STRIDE],
|
srcKey[segmentBase + i * SAMPLE_STRIDE], srcKey + segmentBase + stride, lenB, sortDir);
|
||||||
srcKey + segmentBase + stride, lenB, sortDir);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i < nB) {
|
if (i < nB) {
|
||||||
ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
|
ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
|
||||||
ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] =
|
ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = binarySearchInclusive(
|
||||||
binarySearchInclusive(
|
srcKey[segmentBase + stride + i * SAMPLE_STRIDE], srcKey + segmentBase, lenA, sortDir);
|
||||||
srcKey[segmentBase + stride + i * SAMPLE_STRIDE],
|
|
||||||
srcKey + segmentBase, lenA, sortDir);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -142,12 +139,10 @@ static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey,
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: merge ranks and indices to derive elementary intervals
////////////////////////////////////////////////////////////////////////////////
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    for (uint pos = 0; pos < sampleCount; pos++) {
@@ -161,23 +156,20 @@ static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride,

        if (i < nA) {
            uint dstPosA =
                binarySearchExclusive(
                    ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], ranks + (segmentBase + stride) / SAMPLE_STRIDE, nB, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
        }

        if (i < nB) {
            uint dstPosA =
                binarySearchInclusive(
                    ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
        }
    }
}
@@ -185,9 +177,16 @@ static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride,
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals (each interval is <= SAMPLE_STRIDE)
////////////////////////////////////////////////////////////////////////////////
static void merge(uint *dstKey,
                  uint *dstVal,
                  uint *srcAKey,
                  uint *srcAVal,
                  uint *srcBKey,
                  uint *srcBVal,
                  uint lenA,
                  uint lenB,
                  uint sortDir)
{
    checkOrder(srcAKey, lenA, sortDir);
    checkOrder(srcBKey, lenB, sortDir);
@@ -206,13 +205,18 @@ static void merge(uint *dstKey, uint *dstVal, uint *srcAKey, uint *srcAVal,
    }
}

static void mergeElementaryIntervals(uint *dstKey,
                                     uint *dstVal,
                                     uint *srcKey,
                                     uint *srcVal,
                                     uint *limitsA,
                                     uint *limitsB,
                                     uint stride,
                                     uint N,
                                     uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;

    for (uint pos = 0; pos < mergePairs; pos++) {
        uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
@@ -240,15 +244,18 @@ static void mergeElementaryIntervals(uint *dstKey, uint *dstVal, uint *srcKey,
              (srcKey + segmentBase + 0) + startPosA,
              (srcVal + segmentBase + 0) + startPosA,
              (srcKey + segmentBase + stride) + startPosB,
              (srcVal + segmentBase + stride) + startPosB,
              endPosA - startPosA,
              endPosB - startPosB,
              sortDir);
    }
}

////////////////////////////////////////////////////////////////////////////////
// Naive bubble sort
////////////////////////////////////////////////////////////////////////////////
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir)
{
    if (N <= 1) {
        return;
    }
@@ -278,9 +285,9 @@ static void bubbleSort(uint *key, uint *val, uint N, uint sortDir) {
////////////////////////////////////////////////////////////////////////////////
// Interface function
////////////////////////////////////////////////////////////////////////////////
extern "C" void
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir)
{
    uint *ikey, *ival, *okey, *oval;
    uint stageCount = 0;

@@ -292,7 +299,8 @@ extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
        ival = bufVal;
        okey = dstKey;
        oval = dstVal;
    }
    else {
        ikey = dstKey;
        ival = dstVal;
        okey = bufKey;
@@ -304,8 +312,7 @@ extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
    memcpy(ival, srcVal, N * sizeof(uint));

    for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
        bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos), sortDir);
    }

    printf("Merge...\n");
@@ -329,16 +336,15 @@ extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
        mergeRanksAndIndices(limitsB, ranksB, stride, N);

        // Merge elementary intervals
        mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, N, sortDir);

        if (lastSegmentElements <= stride) {
            // Last merge segment consists of a single array which just needs to be
            // passed through
            memcpy(
                okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
            memcpy(
                oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
        }

        uint *t;

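The host path mirrors the GPU pipeline step for step (a chunked bubble sort standing in for the shared-memory kernel, then the same three merge steps), which is what makes it a useful oracle: a divergence isolates the failing stage. An even simpler independent cross-check is also possible; the sketch below uses std::stable_sort as a substitute reference, named plainly — it is not the sample's method:

    // Independent oracle sketch: verify a result key array against std::stable_sort.
    #include <algorithm>
    #include <functional>
    #include <vector>

    typedef unsigned int uint;

    bool matchesReference(const uint *srcKey, const uint *resKey, uint N, uint sortDir)
    {
        std::vector<uint> ref(srcKey, srcKey + N);
        if (sortDir)
            std::stable_sort(ref.begin(), ref.end());                       // ascending
        else
            std::stable_sort(ref.begin(), ref.end(), std::greater<uint>()); // descending
        return std::equal(ref.begin(), ref.end(), resKey);
    }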
@@ -29,14 +29,15 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Validate sorted keys array (check for integrity and proper order)
////////////////////////////////////////////////////////////////////////////////
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir)
{
    uint *srcHist;
    uint *resHist;

@@ -51,8 +52,7 @@ extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,

    int flag = 1;

    for (uint j = 0; j < batchSize; j++, srcKey += arrayLength, resKey += arrayLength) {
        // Build histograms for keys arrays
        memset(srcHist, 0, numValues * sizeof(uint));
        memset(resHist, 0, numValues * sizeof(uint));
@@ -61,11 +61,9 @@ extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
            if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
                srcHist[srcKey[i]]++;
                resHist[resKey[i]]++;
            }
            else {
                fprintf(stderr, "***Set %u source/result key arrays are not limited properly***\n", j);
                flag = 0;
                goto brk;
            }
@@ -74,18 +72,15 @@ extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
        // Compare the histograms
        for (uint i = 0; i < numValues; i++)
            if (srcHist[i] != resHist[i]) {
                fprintf(stderr, "***Set %u source/result keys histograms do not match***\n", j);
                flag = 0;
                goto brk;
            }

        // Finally check the ordering
        for (uint i = 0; i < arrayLength - 1; i++)
            if ((sortDir && (resKey[i] > resKey[i + 1])) || (!sortDir && (resKey[i] < resKey[i + 1]))) {
                fprintf(stderr, "***Set %u result key array is not ordered properly***\n", j);
                flag = 0;
                goto brk;
            }
@@ -95,7 +90,8 @@ brk:
    free(resHist);
    free(srcHist);

    if (flag)
        printf("OK\n");

    return flag;
}
@@ -103,30 +99,30 @@ brk:
////////////////////////////////////////////////////////////////////////////////
// Value validation / stability check routines
////////////////////////////////////////////////////////////////////////////////
extern "C" void fillValues(uint *val, uint N)
{
    for (uint i = 0; i < N; i++)
        val[i] = i;
}

extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength)
{
    int correctFlag = 1, stableFlag = 1;

    printf("...inspecting keys and values array: ");

    for (uint i = 0; i < batchSize; i++, resKey += arrayLength, resVal += arrayLength) {
        for (uint j = 0; j < arrayLength; j++) {
            if (resKey[j] != srcKey[resVal[j]])
                correctFlag = 0;

            if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && (resVal[j] > resVal[j + 1]))
                stableFlag = 0;
        }
    }

    printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
    printf(stableFlag ? "...stability property: stable!\n" : "...stability property: NOT stable\n");

    return correctFlag;
}

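The stability test works because `fillValues` seeds the value array with each key's original index: after sorting, equal keys must still carry increasing indices, and any decreasing pair flags an unstable reorder. The companion check `resKey[j] != srcKey[resVal[j]]` proves every output pair is a genuine input pair. In miniature:

    // srcKey = {5, 2, 5}, values seeded as indices: srcVal = {0, 1, 2}
    // A stable ascending sort gives resKey = {2, 5, 5}, resVal = {1, 0, 2}:
    //   resKey[j] == srcKey[resVal[j]] for every j -> values still track their keys
    //   the tied keys 5,5 carry values 0 < 2       -> original order kept: stable
    // An unstable sort could emit resVal = {1, 2, 0}; the keys still match,
    // but 2 > 0 on the tied keys trips the stability flag.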
@@ -29,9 +29,9 @@
#include <stdio.h>

// Includes CUDA
#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
@@ -43,9 +43,11 @@ namespace cg = cooperative_groups;

#if __CUDA_ARCH__ >= 700
template <bool writeSquareRoot>
__device__ void reduceBlockData(cuda::barrier<cuda::thread_scope_block> &barrier,
                                cg::thread_block_tile<32> &tile32,
                                double &threadSum,
                                double *result)
{
    extern __shared__ double tmp[];

#pragma unroll
@ -62,9 +64,7 @@ __device__ void reduceBlockData(
|
|||||||
|
|
||||||
// The warp 0 will perform last round of reduction
|
// The warp 0 will perform last round of reduction
|
||||||
if (tile32.meta_group_rank() == 0) {
|
if (tile32.meta_group_rank() == 0) {
|
||||||
double beta = tile32.thread_rank() < tile32.meta_group_size()
|
double beta = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0;
|
||||||
? tmp[tile32.thread_rank()]
|
|
||||||
: 0.0;
|
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
|
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
|
||||||
@ -81,8 +81,8 @@ __device__ void reduceBlockData(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB,
|
__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size)
|
||||||
double *partialResults, int size) {
|
{
|
||||||
#if __CUDA_ARCH__ >= 700
|
#if __CUDA_ARCH__ >= 700
|
||||||
#pragma diag_suppress static_var_with_dynamic_init
|
#pragma diag_suppress static_var_with_dynamic_init
|
||||||
cg::thread_block cta = cg::this_thread_block();
|
cg::thread_block cta = cg::this_thread_block();
|
||||||
@ -105,8 +105,7 @@ __global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB,
|
|||||||
|
|
||||||
// Each thread block performs reduction of partial dotProducts and writes to
|
// Each thread block performs reduction of partial dotProducts and writes to
|
||||||
// global mem.
|
// global mem.
|
||||||
reduceBlockData<false>(barrier, tile32, threadSum,
|
reduceBlockData<false>(barrier, tile32, threadSum, &partialResults[blockIdx.x]);
|
||||||
&partialResults[blockIdx.x]);
|
|
||||||
|
|
||||||
cg::sync(grid);
|
cg::sync(grid);
|
||||||
|
|
||||||
@ -137,15 +136,15 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId);
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
// Program main
|
// Program main
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
printf("%s starting...\n", argv[0]);
|
printf("%s starting...\n", argv[0]);
|
||||||
|
|
||||||
// This will pick the best possible CUDA capable device
|
// This will pick the best possible CUDA capable device
|
||||||
int dev = findCudaDevice(argc, (const char **)argv);
|
int dev = findCudaDevice(argc, (const char **)argv);
|
||||||
|
|
||||||
int major = 0;
|
int major = 0;
|
||||||
checkCudaErrors(
|
checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
|
||||||
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
|
|
||||||
|
|
||||||
// Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher.
|
// Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher.
|
||||||
if (major < 7) {
|
if (major < 7) {
|
||||||
@ -154,12 +153,10 @@ int main(int argc, char **argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int supportsCooperativeLaunch = 0;
|
int supportsCooperativeLaunch = 0;
|
||||||
checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch,
|
checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev));
|
||||||
cudaDevAttrCooperativeLaunch, dev));
|
|
||||||
|
|
||||||
if (!supportsCooperativeLaunch) {
|
if (!supportsCooperativeLaunch) {
|
||||||
printf(
|
printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
|
||||||
"\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
|
|
||||||
"Waiving the run\n",
|
"Waiving the run\n",
|
||||||
dev);
|
dev);
|
||||||
exit(EXIT_WAIVED);
|
exit(EXIT_WAIVED);
|
||||||
@ -171,7 +168,8 @@ int main(int argc, char **argv) {
|
|||||||
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
|
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
|
int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
|
||||||
|
{
|
||||||
float *vecA, *d_vecA;
|
float *vecA, *d_vecA;
|
||||||
float *vecB, *d_vecB;
|
float *vecB, *d_vecB;
|
||||||
double *d_partialResults;
|
double *d_partialResults;
|
||||||
@ -191,16 +189,14 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
|
|||||||
cudaStream_t stream;
|
cudaStream_t stream;
|
||||||
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||||
|
|
||||||
checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size,
|
checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
|
||||||
cudaMemcpyHostToDevice, stream));
|
checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
|
||||||
checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size,
|
|
||||||
cudaMemcpyHostToDevice, stream));
|
|
||||||
|
|
||||||
// Kernel configuration, where a one-dimensional
|
// Kernel configuration, where a one-dimensional
|
||||||
// grid and one-dimensional blocks are configured.
|
// grid and one-dimensional blocks are configured.
|
||||||
int minGridSize = 0, blockSize = 0;
|
int minGridSize = 0, blockSize = 0;
|
||||||
checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
|
checkCudaErrors(
|
||||||
&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));
|
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));
|
||||||
|
|
||||||
int smemSize = ((blockSize / 32) + 1) * sizeof(double);
|
int smemSize = ((blockSize / 32) + 1) * sizeof(double);
|
||||||
|
|
||||||
@ -209,28 +205,24 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
|
|||||||
&numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));
|
&numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));
|
||||||
|
|
||||||
int multiProcessorCount = 0;
|
int multiProcessorCount = 0;
|
||||||
checkCudaErrors(cudaDeviceGetAttribute(
|
checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));
|
||||||
&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));
|
|
||||||
|
|
||||||
minGridSize = multiProcessorCount * numBlocksPerSm;
|
minGridSize = multiProcessorCount * numBlocksPerSm;
|
||||||
checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));
|
checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));
|
||||||
|
|
||||||
printf(
|
printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
|
||||||
"Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
|
|
||||||
"blockSize = %d\n",
|
"blockSize = %d\n",
|
||||||
minGridSize, blockSize);
|
minGridSize,
|
||||||
|
blockSize);
|
||||||
|
|
||||||
dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);
|
dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);
|
||||||
|
|
||||||
void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB,
|
void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, (void *)&d_partialResults, (void *)&size};
|
||||||
(void *)&d_partialResults, (void *)&size};
|
|
||||||
|
|
||||||
checkCudaErrors(
|
checkCudaErrors(cudaLaunchCooperativeKernel(
|
||||||
cudaLaunchCooperativeKernel((void *)normVecByDotProductAWBarrier, dimGrid,
|
(void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream));
|
||||||
dimBlock, kernelArgs, smemSize, stream));
|
|
||||||
|
|
||||||
checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size,
|
checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, cudaMemcpyDeviceToHost, stream));
|
||||||
cudaMemcpyDeviceToHost, stream));
|
|
||||||
checkCudaErrors(cudaStreamSynchronize(stream));
|
checkCudaErrors(cudaStreamSynchronize(stream));
|
||||||
|
|
||||||
float expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
|
float expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
|
||||||
@ -239,7 +231,8 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
|
|||||||
if ((vecA[i] - expectedResult) > 0.00001) {
|
if ((vecA[i] - expectedResult) > 0.00001) {
|
||||||
printf("mismatch at i = %d\n", i);
|
printf("mismatch at i = %d\n", i);
|
||||||
break;
|
break;
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
matches++;
|
matches++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
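The hunks above reflow, but do not change, the sample's cooperative-launch sequence: query an occupancy-friendly block size, cap the grid at the number of blocks that can be co-resident, pack the kernel parameters into an array of pointers, and launch through cudaLaunchCooperativeKernel so that grid-wide synchronization is legal. A minimal self-contained sketch of that sequence follows; the kernel name (myKernel) and the helper are illustrative placeholders, not code from this diff, and error checking is omitted.

#include <cooperative_groups.h>
#include <cuda_runtime.h>

namespace cg = cooperative_groups;

// Hypothetical kernel: every thread scales one element, then the whole
// grid synchronizes -- legal only under a cooperative launch.
__global__ void myKernel(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= 2.0f;
    cg::this_grid().sync();
}

void launchCooperatively(float *d_data, int n, int dev, cudaStream_t stream)
{
    // Pick a block size with good occupancy for this kernel.
    int minGridSize = 0, blockSize = 0;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)myKernel, 0, n);

    // Grid-wide sync requires every block to be resident, so the grid is
    // capped at blocks-per-SM times the SM count.
    int numBlocksPerSm = 0, smCount = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, myKernel, blockSize, 0);
    cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, dev);
    dim3 dimGrid(smCount * numBlocksPerSm, 1, 1), dimBlock(blockSize, 1, 1);

    // Parameters are passed as an array of pointers to each argument.
    void *kernelArgs[] = {(void *)&d_data, (void *)&n};
    cudaLaunchCooperativeKernel((void *)myKernel, dimGrid, dimBlock, kernelArgs, 0, stream);
}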
@@ -34,8 +34,8 @@
 #endif

 // Includes, system
-#include <stdio.h>
 #include <cassert>
+#include <stdio.h>

 // Includes CUDA
 #include <cuda_runtime.h>
@@ -58,7 +58,8 @@ bool testResult = true;
 //! Tests assert function.
 //! Thread whose id > N will print assertion failed error message.
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void testKernel(int N) {
+__global__ void testKernel(int N)
+{
     int gtid = blockIdx.x * blockDim.x + threadIdx.x;
     assert(gtid < N);
 }
@@ -70,17 +71,18 @@ void runTest(int argc, char **argv);
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("%s starting...\n", sampleName);

     runTest(argc, argv);

-    printf("%s completed, returned %s\n", sampleName,
-           testResult ? "OK" : "ERROR!");
+    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
     exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }

-void runTest(int argc, char **argv) {
+void runTest(int argc, char **argv)
+{
     int Nblocks = 2;
     int Nthreads = 32;
     cudaError_t error;
@@ -94,7 +96,8 @@ void runTest(int argc, char **argv) {
     if (!strcasecmp(OS_System_Type.sysname, "Darwin")) {
         printf("simpleAssert is not current supported on Mac OSX\n\n");
         exit(EXIT_SUCCESS);
-    } else {
+    }
+    else {
         printf("OS Info: <%s>\n\n", OS_System_Type.version);
     }

@@ -118,8 +121,7 @@ void runTest(int argc, char **argv) {

     // Check for errors and failed asserts in asynchronous kernel launch.
     if (error == cudaErrorAssert) {
-        printf(
-            "Device assert failed as expected, "
+        printf("Device assert failed as expected, "
                "CUDA error message is: %s\n\n",
                cudaGetErrorString(error));
     }
@@ -34,11 +34,12 @@
 #endif

 // Includes, system
-#include <stdio.h>
 #include <cassert>
+#include <stdio.h>

 // Includes CUDA
 #include <cuda_runtime.h>

 #include "nvrtc_helper.h"

 // Utilities and timing functions
@@ -58,7 +59,8 @@ void runTest(int argc, char **argv);
 // Program main
 ////////////////////////////////////////////////////////////////////////////////

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("%s starting...\n", sampleName);

     runTest(argc, argv);
@@ -66,7 +68,8 @@ int main(int argc, char **argv) {
     exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }

-void runTest(int argc, char **argv) {
+void runTest(int argc, char **argv)
+{
     int Nblocks = 2;
     int Nthreads = 32;

@@ -91,10 +94,15 @@ void runTest(int argc, char **argv) {
     int count = 60;
     void *args[] = {(void *)&count};

-    checkCudaErrors(cuLaunchKernel(
-        kernel_addr, dimGrid.x, dimGrid.y, dimGrid.z, /* grid dim */
-        dimBlock.x, dimBlock.y, dimBlock.z, /* block dim */
-        0, 0, /* shared mem, stream */
+    checkCudaErrors(cuLaunchKernel(kernel_addr,
+                                   dimGrid.x,
+                                   dimGrid.y,
+                                   dimGrid.z, /* grid dim */
+                                   dimBlock.x,
+                                   dimBlock.y,
+                                   dimBlock.z, /* block dim */
+                                   0,
+                                   0, /* shared mem, stream */
                                    &args[0], /* arguments */
                                    0));
@@ -32,7 +32,8 @@
 //! Thread whose id > N will print assertion failed error message.
 ////////////////////////////////////////////////////////////////////////////////

-extern "C" __global__ void testKernel(int N) {
+extern "C" __global__ void testKernel(int N)
+{
     int gtid = blockIdx.x * blockDim.x + threadIdx.x;
     assert(gtid < N);
 }
@@ -30,10 +30,10 @@
 */

 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 #ifdef _WIN32
 #define WINDOWS_LEAN_AND_MEAN
@@ -68,20 +68,21 @@ extern "C" bool computeGold(int *gpuData, const int len);
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("%s starting...\n", sampleName);

     runTest(argc, argv);

-    printf("%s completed, returned %s\n", sampleName,
-           testResult ? "OK" : "ERROR!");
+    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
     exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////
-void runTest(int argc, char **argv) {
+void runTest(int argc, char **argv)
+{
     cudaStream_t stream;
     // This will pick the best possible CUDA capable device
     findCudaDevice(argc, (const char **)argv);
@@ -100,7 +101,8 @@ void runTest(int argc, char **argv) {
     checkCudaErrors(cudaMallocHost(&hOData, memSize));

     // initialize the memory
-    for (unsigned int i = 0; i < numData; i++) hOData[i] = 0;
+    for (unsigned int i = 0; i < numData; i++)
+        hOData[i] = 0;

     // To make the AND and XOR tests generate something other than 0...
     hOData[8] = hOData[10] = 0xff;
@@ -110,15 +112,13 @@ void runTest(int argc, char **argv) {
     int *dOData;
     checkCudaErrors(cudaMalloc((void **)&dOData, memSize));
     // copy host memory to device to initialize to zero
-    checkCudaErrors(
-        cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream));

     // execute the kernel
     testKernel<<<numBlocks, numThreads, 0, stream>>>(dOData);

     // Copy result from device to host
-    checkCudaErrors(
-        cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream));
+    checkCudaErrors(cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream));
     checkCudaErrors(cudaStreamSynchronize(stream));

     sdkStopTimer(&timer);
@@ -42,7 +42,8 @@ extern "C" int computeGold(int *gpuData, const int len);
 //! @param idata input data as provided to device
 //! @param len number of elements in reference / idata
 ////////////////////////////////////////////////////////////////////////////////
-int computeGold(int *gpuData, const int len) {
+int computeGold(int *gpuData, const int len)
+{
     int val = 0;

     for (int i = 0; i < len; ++i) {
@@ -35,7 +35,8 @@
 //! @param g_idata input data in global memory
 //! @param g_odata output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void testKernel(int *g_odata) {
+__global__ void testKernel(int *g_odata)
+{
     // access thread id
     const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
@@ -30,10 +30,10 @@
 */

 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 #ifdef _WIN32
 #define WINDOWS_LEAN_AND_MEAN
@@ -64,13 +64,13 @@ extern "C" bool computeGold(int *gpuData, const int len);
 // Program main
 ////////////////////////////////////////////////////////////////////////////////

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("%s starting...\n", sampleName);

     runTest(argc, argv);

-    printf("%s completed, returned %s\n", sampleName,
-           testResult ? "OK" : "ERROR!");
+    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");

     exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }
@@ -79,7 +79,8 @@ int main(int argc, char **argv) {
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////

-void runTest(int argc, char **argv) {
+void runTest(int argc, char **argv)
+{
     int dev = 0;

     char *cubin, *kernel_file;
@@ -106,7 +107,8 @@ void runTest(int argc, char **argv) {
     int *hOData = (int *)malloc(memSize);

     // initialize the memory
-    for (unsigned int i = 0; i < numData; i++) hOData[i] = 0;
+    for (unsigned int i = 0; i < numData; i++)
+        hOData[i] = 0;

     // To make the AND and XOR tests generate something other than 0...
     hOData[8] = hOData[10] = 0xff;
@@ -121,11 +123,15 @@ void runTest(int argc, char **argv) {
     dim3 cudaGridSize(numBlocks, 1, 1);

     void *arr[] = {(void *)&dOData};
-    checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y,
+    checkCudaErrors(cuLaunchKernel(kernel_addr,
+                                   cudaGridSize.x,
+                                   cudaGridSize.y,
                                    cudaGridSize.z, /* grid dim */
-                                   cudaBlockSize.x, cudaBlockSize.y,
+                                   cudaBlockSize.x,
+                                   cudaBlockSize.y,
                                    cudaBlockSize.z, /* block dim */
-                                   0, 0, /* shared mem, stream */
+                                   0,
+                                   0, /* shared mem, stream */
                                    &arr[0], /* arguments */
                                    0));
@@ -43,7 +43,8 @@ extern "C" int computeGold(int *gpuData, const int len);
 //! @param len number of elements in reference / idata
 ////////////////////////////////////////////////////////////////////////////////

-int computeGold(int *gpuData, const int len) {
+int computeGold(int *gpuData, const int len)
+{
     int val = 0;

     for (int i = 0; i < len; ++i) {
@@ -36,7 +36,8 @@
 //! @param g_odata output data in global memory
 ////////////////////////////////////////////////////////////////////////////////

-extern "C" __global__ void testKernel(int *g_odata) {
+extern "C" __global__ void testKernel(int *g_odata)
+{
     // access thread id
     const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
@@ -26,10 +26,10 @@
 */

 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 // includes CUDA
 #include <cuda_runtime.h>
@@ -42,7 +42,8 @@
 // declaration, forward
 void runTest(int argc, char **argv);

-cudaAccessPolicyWindow initAccessPolicyWindow(void) {
+cudaAccessPolicyWindow initAccessPolicyWindow(void)
+{
     cudaAccessPolicyWindow accessPolicyWindow = {0};
     accessPolicyWindow.base_ptr = (void *)0;
     accessPolicyWindow.num_bytes = 0;
@@ -60,8 +61,8 @@ cudaAccessPolicyWindow initAccessPolicyWindow(void) {
 //! @param bigDataSize input bigData size
 //! @param hitcount how many data access are done within block
 ////////////////////////////////////////////////////////////////////////////////
-static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash,
-                                            int bigDataSize, int hitCount) {
+static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, int bigDataSize, int hitCount)
+{
     __shared__ unsigned int hit;
     int row = blockIdx.y * blockDim.y + threadIdx.y;
     int col = blockIdx.x * blockDim.x + threadIdx.x;
@@ -82,9 +83,9 @@ static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash,

     if ((tID % 2) == 0) {
         data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize];
-    } else {
-        trash[psRand % bigDataSize] =
-            trash[psRand % bigDataSize] + trash[idx % bigDataSize];
+    }
+    else {
+        trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize];
     }

     atomicAdd(&hit, 1);
@@ -98,7 +99,8 @@ int main(int argc, char **argv) { runTest(argc, argv); }
 ////////////////////////////////////////////////////////////////////////////////
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////
-void runTest(int argc, char **argv) {
+void runTest(int argc, char **argv)
+{
     bool bTestResult = true;
     cudaAccessPolicyWindow accessPolicyWindow;
     cudaDeviceProp deviceProp;
@@ -127,8 +129,7 @@ void runTest(int argc, char **argv) {

     // Make sure device the l2 optimization
     if (deviceProp.persistingL2CacheMaxSize == 0) {
-        printf(
-            "Waiving execution as device %d does not support persisting L2 "
+        printf("Waiving execution as device %d does not support persisting L2 "
                "Caching\n",
                devID);
         exit(EXIT_WAIVED);
@@ -139,8 +140,7 @@ void runTest(int argc, char **argv) {

     // Set the amount of l2 cache that will be persisting to maximum the device
     // can support
-    checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize,
-                                       deviceProp.persistingL2CacheMaxSize));
+    checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize));

     // Stream attribute to set
     streamAttrID = cudaStreamAttributeAccessPolicyWindow;
@@ -155,8 +155,7 @@ void runTest(int argc, char **argv) {

     // Allocate data
     checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int)));
-    checkCudaErrors(
-        cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));
+    checkCudaErrors(cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));

     for (int i = 0; i < bigDataSize; ++i) {
         if (i < dataSize) {
@@ -166,16 +165,12 @@ void runTest(int argc, char **argv) {
         bigDataHostPointer[bigDataSize - i - 1] = i;
     }

-    checkCudaErrors(
-        cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
-    checkCudaErrors(
-        cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
-    checkCudaErrors(cudaMemcpyAsync(dataDevicePointer, dataHostPointer,
-                                    dataSize * sizeof(int),
-                                    cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(bigDataDevicePointer, bigDataHostPointer,
-                                    bigDataSize * sizeof(int),
-                                    cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
+    checkCudaErrors(cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
+    checkCudaErrors(
+        cudaMemcpyAsync(dataDevicePointer, dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(
+        bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream));

     // Make a window for the buffer of interest
     accessPolicyWindow.base_ptr = (void *)dataDevicePointer;
@@ -186,8 +181,7 @@ void runTest(int argc, char **argv) {
     streamAttrValue.accessPolicyWindow = accessPolicyWindow;

     // Assign window to stream
-    checkCudaErrors(
-        cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));
+    checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));

     // Demote any previous persisting lines
     checkCudaErrors(cudaCtxResetPersistingL2Cache());
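The simpleAttributes hunks above all orbit one recipe: reserve persisting L2, describe an access-policy window over the hot buffer, attach it to a stream, and reset the cache between experiments. A compact sketch of that recipe follows; the function name (setPersistingWindow) and the arguments (ptr, bytes) are placeholders for illustration, not code from this diff.

#include <cuda_runtime.h>

// Sketch only: pin `bytes` of `ptr` into persisting L2 for work on `stream`.
void setPersistingWindow(cudaStream_t stream, void *ptr, size_t bytes, int device)
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);

    // Reserve as much persisting L2 as the device allows.
    cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, prop.persistingL2CacheMaxSize);

    cudaStreamAttrValue attr = {};
    attr.accessPolicyWindow.base_ptr  = ptr;
    attr.accessPolicyWindow.num_bytes = bytes; // should not exceed prop.accessPolicyMaxWindowSize
    attr.accessPolicyWindow.hitRatio  = 1.0f;  // fraction of the window treated as persisting
    attr.accessPolicyWindow.hitProp   = cudaAccessPropertyPersisting;
    attr.accessPolicyWindow.missProp  = cudaAccessPropertyStreaming;
    cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);

    // Evict anything previously pinned so the new window starts clean.
    cudaCtxResetPersistingL2Cache();
}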
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
-
@@ -50,8 +50,8 @@
 #endif

 // CUDA includes
-#include <cuda_runtime.h>
 #include <cuda_gl_interop.h>
+#include <cuda_runtime.h>

 // CUDA utilities and system includes
 #include <helper_cuda.h>
@@ -124,8 +124,7 @@ StopWatchInterface *timer = NULL;
 GLuint shDraw;

 ////////////////////////////////////////////////////////////////////////////////
-extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes,
-                                   unsigned int *g_odata, int imgw);
+extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw);

 // Forward declarations
 void runStdProgram(int argc, char **argv);
@@ -140,8 +139,7 @@ void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource);
 void deletePBO(GLuint *pbo);
 #endif

-void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x,
-                      unsigned int size_y);
+void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, unsigned int size_y);
 void deleteTexture(GLuint *tex);

 // rendering callbacks
@@ -155,7 +153,8 @@ void mainMenu(int i);
 ////////////////////////////////////////////////////////////////////////////////
 //! Create PBO
 ////////////////////////////////////////////////////////////////////////////////
-void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource) {
+void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource)
+{
     // set up vertex data parameter
     num_texels = image_width * image_height;
     num_values = num_texels * 4;
@@ -171,33 +170,32 @@ void createPBO(GLuint *pbo, struct cudaGraphicsResource **pbo_resource) {
     glBindBuffer(GL_ARRAY_BUFFER, 0);

     // register this buffer object with CUDA
-    checkCudaErrors(cudaGraphicsGLRegisterBuffer(pbo_resource, *pbo,
-                                                 cudaGraphicsMapFlagsNone));
+    checkCudaErrors(cudaGraphicsGLRegisterBuffer(pbo_resource, *pbo, cudaGraphicsMapFlagsNone));

     SDK_CHECK_ERROR_GL();
 }

-void deletePBO(GLuint *pbo) {
+void deletePBO(GLuint *pbo)
+{
     glDeleteBuffers(1, pbo);
     SDK_CHECK_ERROR_GL();
     *pbo = 0;
 }
 #endif

-const GLenum fbo_targets[] = {
-    GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT,
-    GL_COLOR_ATTACHMENT2_EXT, GL_COLOR_ATTACHMENT3_EXT};
+const GLenum fbo_targets[] = {GL_COLOR_ATTACHMENT0_EXT,
+                              GL_COLOR_ATTACHMENT1_EXT,
+                              GL_COLOR_ATTACHMENT2_EXT,
+                              GL_COLOR_ATTACHMENT3_EXT};

 #ifndef USE_TEXSUBIMAGE2D
-static const char *glsl_drawtex_vertshader_src =
-    "void main(void)\n"
+static const char *glsl_drawtex_vertshader_src = "void main(void)\n"
                                                  "{\n"
                                                  "  gl_Position = gl_Vertex;\n"
                                                  "  gl_TexCoord[0].xy = gl_MultiTexCoord0.xy;\n"
                                                  "}\n";

-static const char *glsl_drawtex_fragshader_src =
-    "#version 130\n"
+static const char *glsl_drawtex_fragshader_src = "#version 130\n"
                                                  "uniform usampler2D texImage;\n"
                                                  "void main()\n"
                                                  "{\n"
@@ -227,15 +225,15 @@ static const char *glsl_draw_fragshader_src =
 #endif

 // copy image and process using CUDA
-void generateCUDAImage() {
+void generateCUDAImage()
+{
     // run the Cuda kernel
     unsigned int *out_data;

 #ifdef USE_TEXSUBIMAGE2D
     checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_dest_resource, 0));
     size_t num_bytes;
-    checkCudaErrors(cudaGraphicsResourceGetMappedPointer(
-        (void **)&out_data, &num_bytes, cuda_pbo_dest_resource));
+    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&out_data, &num_bytes, cuda_pbo_dest_resource));
     // printf("CUDA mapped pointer of pbo_out: May access %ld bytes, expected %d\n",
     // num_bytes, size_tex_data);
 #else
@@ -258,8 +256,7 @@ void generateCUDAImage() {
     glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo_dest);

     glBindTexture(GL_TEXTURE_2D, tex_cudaResult);
-    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_RGBA,
-                    GL_UNSIGNED_BYTE, NULL);
+    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, image_width, image_height, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
     SDK_CHECK_ERROR_GL();
     glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);
     glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
@@ -268,21 +265,20 @@ void generateCUDAImage() {
     // map buffer objects to get CUDA device pointers
     cudaArray *texture_ptr;
     checkCudaErrors(cudaGraphicsMapResources(1, &cuda_tex_result_resource, 0));
-    checkCudaErrors(cudaGraphicsSubResourceGetMappedArray(
-        &texture_ptr, cuda_tex_result_resource, 0, 0));
+    checkCudaErrors(cudaGraphicsSubResourceGetMappedArray(&texture_ptr, cuda_tex_result_resource, 0, 0));

     int num_texels = image_width * image_height;
     int num_values = num_texels * 4;
     int size_tex_data = sizeof(GLubyte) * num_values;
-    checkCudaErrors(cudaMemcpyToArray(texture_ptr, 0, 0, cuda_dest_resource,
-                                      size_tex_data, cudaMemcpyDeviceToDevice));
+    checkCudaErrors(cudaMemcpyToArray(texture_ptr, 0, 0, cuda_dest_resource, size_tex_data, cudaMemcpyDeviceToDevice));

     checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_tex_result_resource, 0));
 #endif
 }

 // display image to the screen as textured quad
-void displayImage(GLuint texture) {
+void displayImage(GLuint texture)
+{
     glBindTexture(GL_TEXTURE_2D, texture);
     glEnable(GL_TEXTURE_2D);
     glDisable(GL_DEPTH_TEST);
@@ -332,7 +328,8 @@ void displayImage(GLuint texture) {
 ////////////////////////////////////////////////////////////////////////////////
 //! Display callback
 ////////////////////////////////////////////////////////////////////////////////
-void display() {
+void display()
+{
     sdkStartTimer(&timer);

     if (enable_cuda) {
@@ -358,9 +355,7 @@ void display() {
         sprintf(currentOutputPPM, "kilt.ppm");
         g_CheckRender->savePPM(currentOutputPPM, true, NULL);

-        if (!g_CheckRender->PPMvsPPM(currentOutputPPM,
-                                     sdkFindFilePath(ref_file, pArgv[0]),
-                                     MAX_EPSILON, 0.30f)) {
+        if (!g_CheckRender->PPMvsPPM(currentOutputPPM, sdkFindFilePath(ref_file, pArgv[0]), MAX_EPSILON, 0.30f)) {
            g_TotalErrors++;
        }

@@ -374,8 +369,7 @@ void display() {
     if (++fpsCount == fpsLimit) {
         char cTitle[256];
         float fps = 1000.0f / sdkGetAverageTimerValue(&timer);
-        sprintf(cTitle, "CUDA GL Post Processing (%d x %d): %.1f fps", window_width,
-                window_height, fps);
+        sprintf(cTitle, "CUDA GL Post Processing (%d x %d): %.1f fps", window_width, window_height, fps);
         glutSetWindowTitle(cTitle);
         // printf("%s\n", cTitle);
         fpsCount = 0;
@@ -384,7 +378,8 @@ void display() {
     }
 }

-void timerEvent(int value) {
+void timerEvent(int value)
+{
     glutPostRedisplay();
     glutTimerFunc(REFRESH_DELAY, timerEvent, 0);
 }
@@ -392,7 +387,8 @@ void timerEvent(int value) {
 ////////////////////////////////////////////////////////////////////////////////
 //! Keyboard events handler
 ////////////////////////////////////////////////////////////////////////////////
-void keyboard(unsigned char key, int /*x*/, int /*y*/) {
+void keyboard(unsigned char key, int /*x*/, int /*y*/)
+{
     switch (key) {
     case (27):
         Cleanup(EXIT_SUCCESS);
@@ -404,7 +400,8 @@ void keyboard(unsigned char key, int /*x*/, int /*y*/) {

         if (enable_cuda) {
             glClearColorIuiEXT(128, 128, 128, 255);
-        } else {
+        }
+        else {
             glClearColor(0.5, 0.5, 0.5, 1.0);
         }

@@ -413,7 +410,8 @@ void keyboard(unsigned char key, int /*x*/, int /*y*/) {
     }
 }

-void reshape(int w, int h) {
+void reshape(int w, int h)
+{
     window_width = w;
     window_height = h;
 }
@@ -423,8 +421,8 @@ void mainMenu(int i) { keyboard((unsigned char)i, 0, 0); }
 ////////////////////////////////////////////////////////////////////////////////
 //!
 ////////////////////////////////////////////////////////////////////////////////
-void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x,
-                      unsigned int size_y) {
+void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x, unsigned int size_y)
+{
     // create a texture
     glGenTextures(1, tex_cudaResult);
     glBindTexture(GL_TEXTURE_2D, *tex_cudaResult);
@@ -436,24 +434,22 @@ void createTextureDst(GLuint *tex_cudaResult, unsigned int size_x,
     glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);

 #ifdef USE_TEXSUBIMAGE2D
-    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, size_x, size_y, 0, GL_RGBA,
-                 GL_UNSIGNED_BYTE, NULL);
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, size_x, size_y, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
     SDK_CHECK_ERROR_GL();
 #else
-    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, size_x, size_y, 0,
-                 GL_RGBA_INTEGER_EXT, GL_UNSIGNED_BYTE, NULL);
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8UI_EXT, size_x, size_y, 0, GL_RGBA_INTEGER_EXT, GL_UNSIGNED_BYTE, NULL);
     SDK_CHECK_ERROR_GL();
     // register this texture with CUDA
     checkCudaErrors(cudaGraphicsGLRegisterImage(
-        &cuda_tex_result_resource, *tex_cudaResult, GL_TEXTURE_2D,
-        cudaGraphicsMapFlagsWriteDiscard));
+        &cuda_tex_result_resource, *tex_cudaResult, GL_TEXTURE_2D, cudaGraphicsMapFlagsWriteDiscard));
 #endif
 }

 ////////////////////////////////////////////////////////////////////////////////
 //!
 ////////////////////////////////////////////////////////////////////////////////
-void deleteTexture(GLuint *tex) {
+void deleteTexture(GLuint *tex)
+{
     glDeleteTextures(1, tex);
     SDK_CHECK_ERROR_GL();
@@ -463,7 +459,8 @@ void deleteTexture(GLuint *tex) {
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 #if defined(__linux__)
     char *Xstatus = getenv("DISPLAY");
     if (Xstatus == NULL) {
@@ -487,8 +484,7 @@ int main(int argc, char **argv) {
     if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
         printf("[%s]\n", argv[0]);
         printf(" Does not explicitly support -device=n\n");
-        printf(
-            " This sample requires OpenGL. Only -file=<reference> are "
+        printf(" This sample requires OpenGL. Only -file=<reference> are "
                "supported\n");
         printf("exiting...\n");
         exit(EXIT_WAIVED);
@@ -497,7 +493,8 @@ int main(int argc, char **argv) {
     if (ref_file) {
         printf("(Test with OpenGL verification)\n");
         runStdProgram(argc, argv);
-    } else {
+    }
+    else {
         printf("(Interactive OpenGL Demo)\n");
         runStdProgram(argc, argv);
     }
@@ -508,7 +505,8 @@ int main(int argc, char **argv) {
 ////////////////////////////////////////////////////////////////////////////////
 //!
 ////////////////////////////////////////////////////////////////////////////////
-void FreeResource() {
+void FreeResource()
+{
     sdkDeleteTimer(&timer);

     // unregister this buffer object with CUDA
@@ -530,18 +528,18 @@ void FreeResource() {
     printf("simpleCUDA2GL Exiting...\n");
 }

-void Cleanup(int iExitCode) {
+void Cleanup(int iExitCode)
+{
     FreeResource();
-    printf("PPM Images are %s\n",
-           (iExitCode == EXIT_SUCCESS) ? "Matching" : "Not Matching");
+    printf("PPM Images are %s\n", (iExitCode == EXIT_SUCCESS) ? "Matching" : "Not Matching");
     exit(iExitCode);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //!
 ////////////////////////////////////////////////////////////////////////////////
-GLuint compileGLSLprogram(const char *vertex_shader_src,
-                          const char *fragment_shader_src) {
+GLuint compileGLSLprogram(const char *vertex_shader_src, const char *fragment_shader_src)
+{
     GLuint v, f, p = 0;

     p = glCreateProgram();
@@ -563,7 +561,8 @@ GLuint compileGLSLprogram(const char *vertex_shader_src,
             // #endif
             glDeleteShader(v);
             return 0;
-        } else {
+        }
+        else {
             glAttachShader(p, v);
         }
     }
@@ -585,7 +584,8 @@ GLuint compileGLSLprogram(const char *vertex_shader_src,
             // #endif
             glDeleteShader(f);
             return 0;
-        } else {
+        }
+        else {
             glAttachShader(p, f);
         }
     }
@@ -611,7 +611,8 @@ GLuint compileGLSLprogram(const char *vertex_shader_src,
 //! Allocate the "render target" of CUDA
 ////////////////////////////////////////////////////////////////////////////////
 #ifndef USE_TEXSUBIMAGE2D
-void initCUDABuffers() {
+void initCUDABuffers()
+{
     // set up vertex data parameter
     num_texels = image_width * image_height;
     num_values = num_texels * 4;
@ -625,7 +626,8 @@ void initCUDABuffers() {
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
//!
|
//!
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
void initGLBuffers() {
|
void initGLBuffers()
|
||||||
|
{
|
||||||
// create pbo
|
// create pbo
|
||||||
#ifdef USE_TEXSUBIMAGE2D
|
#ifdef USE_TEXSUBIMAGE2D
|
||||||
createPBO(&pbo_dest, &cuda_pbo_dest_resource);
|
createPBO(&pbo_dest, &cuda_pbo_dest_resource);
|
||||||
@ -636,8 +638,7 @@ void initGLBuffers() {
|
|||||||
shDraw = compileGLSLprogram(NULL, glsl_draw_fragshader_src);
|
shDraw = compileGLSLprogram(NULL, glsl_draw_fragshader_src);
|
||||||
|
|
||||||
#ifndef USE_TEXSUBIMAGE2D
|
#ifndef USE_TEXSUBIMAGE2D
|
||||||
shDrawTex = compileGLSLprogram(glsl_drawtex_vertshader_src,
|
shDrawTex = compileGLSLprogram(glsl_drawtex_vertshader_src, glsl_drawtex_fragshader_src);
|
||||||
glsl_drawtex_fragshader_src);
|
|
||||||
#endif
|
#endif
|
||||||
SDK_CHECK_ERROR_GL();
|
SDK_CHECK_ERROR_GL();
|
||||||
}
|
}
|
||||||
@ -645,7 +646,8 @@ void initGLBuffers() {
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
//! Run standard demo loop with or without GL verification
|
//! Run standard demo loop with or without GL verification
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
void runStdProgram(int argc, char **argv) {
|
void runStdProgram(int argc, char **argv)
|
||||||
|
{
|
||||||
// First initialize OpenGL context, so we can properly set the GL for CUDA.
|
// First initialize OpenGL context, so we can properly set the GL for CUDA.
|
||||||
// This is necessary in order to achieve optimal performance with OpenGL/CUDA
|
// This is necessary in order to achieve optimal performance with OpenGL/CUDA
|
||||||
// interop.
|
// interop.
|
||||||
@ -683,8 +685,7 @@ void runStdProgram(int argc, char **argv) {
|
|||||||
g_CheckRender->EnableQAReadback(true);
|
g_CheckRender->EnableQAReadback(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
printf(
|
printf("\n"
|
||||||
"\n"
|
|
||||||
"\tControls\n"
|
"\tControls\n"
|
||||||
"\t(right click mouse button for Menu)\n"
|
"\t(right click mouse button for Menu)\n"
|
||||||
"\t[esc] - Quit\n\n");
|
"\t[esc] - Quit\n\n");
|
||||||
@ -699,7 +700,8 @@ void runStdProgram(int argc, char **argv) {
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
//! Initialize GL
|
//! Initialize GL
|
||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
bool initGL(int *argc, char **argv) {
|
bool initGL(int *argc, char **argv)
|
||||||
|
{
|
||||||
// Create GL context
|
// Create GL context
|
||||||
glutInit(argc, argv);
|
glutInit(argc, argv);
|
||||||
glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH);
|
glutInitDisplayMode(GLUT_RGBA | GLUT_ALPHA | GLUT_DOUBLE | GLUT_DEPTH);
|
||||||
@ -707,8 +709,8 @@ bool initGL(int *argc, char **argv) {
|
|||||||
iGLUTWindowHandle = glutCreateWindow("CUDA OpenGL post-processing");
|
iGLUTWindowHandle = glutCreateWindow("CUDA OpenGL post-processing");
|
||||||
|
|
||||||
// initialize necessary OpenGL extensions
|
// initialize necessary OpenGL extensions
|
||||||
if (!isGLVersionSupported(2, 0) ||
|
if (!isGLVersionSupported(2, 0)
|
||||||
!areGLExtensionsSupported("GL_ARB_pixel_buffer_object "
|
|| !areGLExtensionsSupported("GL_ARB_pixel_buffer_object "
|
||||||
"GL_EXT_framebuffer_object")) {
|
"GL_EXT_framebuffer_object")) {
|
||||||
printf("ERROR: Support for necessary OpenGL extensions missing.");
|
printf("ERROR: Support for necessary OpenGL extensions missing.");
|
||||||
fflush(stderr);
|
fflush(stderr);
|
||||||
@ -729,8 +731,7 @@ bool initGL(int *argc, char **argv) {
|
|||||||
// projection
|
// projection
|
||||||
glMatrixMode(GL_PROJECTION);
|
glMatrixMode(GL_PROJECTION);
|
||||||
glLoadIdentity();
|
glLoadIdentity();
|
||||||
gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1f,
|
gluPerspective(60.0, (GLfloat)window_width / (GLfloat)window_height, 0.1f, 10.0f);
|
||||||
10.0f);
|
|
||||||
|
|
||||||
glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
|
glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
|
||||||
|
|
||||||
|
|||||||
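
Context for the `initGLBuffers()` hunks: the sample drives CUDA output through a GL pixel buffer object. A minimal sketch of that interop pattern, assuming a valid GL context; the names `pbo`, `registerPBO`, and `mapPBO` are illustrative and error checking is elided:

```cpp
// Sketch: register a GL pixel buffer object with CUDA, then map it to obtain
// a device pointer a kernel can write into.
#include <cuda_gl_interop.h>

cudaGraphicsResource *cuda_pbo_resource = nullptr;

void registerPBO(GLuint pbo)
{
    // One-time registration of the GL buffer with CUDA
    cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard);
}

unsigned int *mapPBO(size_t *num_bytes)
{
    // Per-frame: map the resource and fetch the device pointer
    unsigned int *d_ptr = nullptr;
    cudaGraphicsMapResources(1, &cuda_pbo_resource, 0);
    cudaGraphicsResourceGetMappedPointer((void **)&d_ptr, num_bytes, cuda_pbo_resource);
    return d_ptr; // launch kernels on d_ptr, then cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0);
}
```
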

@@ -35,14 +35,16 @@ __device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); }
 __device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); }

 // convert floating point rgb color to 8-bit integer
-__device__ int rgbToInt(float r, float g, float b) {
+__device__ int rgbToInt(float r, float g, float b)
+{
 r = clamp(r, 0.0f, 255.0f);
 g = clamp(g, 0.0f, 255.0f);
 b = clamp(b, 0.0f, 255.0f);
 return (int(b) << 16) | (int(g) << 8) | int(r);
 }

-__global__ void cudaProcess(unsigned int *g_odata, int imgw) {
+__global__ void cudaProcess(unsigned int *g_odata, int imgw)
+{
 extern __shared__ uchar4 sdata[];

 int tx = threadIdx.x;

@@ -56,7 +58,7 @@ __global__ void cudaProcess(unsigned int *g_odata, int imgw) {
 g_odata[y * imgw + x] = rgbToInt(c4.z, c4.y, c4.x);
 }

-extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes,
-unsigned int *g_odata, int imgw) {
+extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw)
+{
 cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw);
 }
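
`launch_cudaProcess()` forwards `sbytes` as the third launch parameter, which sizes the kernel's `extern __shared__` array at launch time. A self-contained sketch of the same mechanism (kernel and variable names hypothetical):

```cpp
// Sketch: the third <<<>>> parameter backs the extern __shared__ array.
__global__ void staged(const int *in, int *out, int n)
{
    extern __shared__ int tile[]; // sized by the dynamic shared-memory argument
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    tile[threadIdx.x] = (i < n) ? in[i] : 0; // stage input through shared memory
    __syncthreads();                         // every thread reaches the barrier
    if (i < n)
        out[i] = tile[threadIdx.x] + 1;
}

// Usage: one int of shared memory per thread in the block.
// staged<<<grid, block, block.x * sizeof(int)>>>(d_in, d_out, n);
```
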

@@ -29,18 +29,21 @@

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 // Create thread
-CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
+CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
+{
 return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
 }

 // Wait for thread to finish
-void cutEndThread(CUTThread thread) {
+void cutEndThread(CUTThread thread)
+{
 WaitForSingleObject(thread, INFINITE);
 CloseHandle(thread);
 }

 // Wait for multiple threads
-void cutWaitForThreads(const CUTThread *threads, int num) {
+void cutWaitForThreads(const CUTThread *threads, int num)
+{
 WaitForMultipleObjects(num, threads, true, INFINITE);

 for (int i = 0; i < num; i++) {

@@ -49,7 +52,8 @@ void cutWaitForThreads(const CUTThread *threads, int num) {
 }

 // Create barrier.
-CUTBarrier cutCreateBarrier(int releaseCount) {
+CUTBarrier cutCreateBarrier(int releaseCount)
+{
 CUTBarrier barrier;

 InitializeCriticalSection(&barrier.criticalSection);

@@ -61,7 +65,8 @@ CUTBarrier cutCreateBarrier(int releaseCount) {
 }

 // Increment barrier. (execution continues)
-void cutIncrementBarrier(CUTBarrier *barrier) {
+void cutIncrementBarrier(CUTBarrier *barrier)
+{
 int myBarrierCount;
 EnterCriticalSection(&barrier->criticalSection);
 myBarrierCount = ++barrier->count;

@@ -73,16 +78,15 @@ void cutIncrementBarrier(CUTBarrier *barrier) {
 }

 // Wait for barrier release.
-void cutWaitForBarrier(CUTBarrier *barrier) {
-WaitForSingleObject(barrier->barrierEvent, INFINITE);
-}
+void cutWaitForBarrier(CUTBarrier *barrier) { WaitForSingleObject(barrier->barrierEvent, INFINITE); }

 // Destroy barrier
 void cutDestroyBarrier(CUTBarrier *barrier) {}

 #else
 // Create thread
-CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
+CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
+{
 pthread_t thread;
 pthread_create(&thread, NULL, func, data);
 return thread;

@@ -92,14 +96,16 @@ CUTThread cutStartThread(CUT_THREADROUTINE func, void *data) {
 void cutEndThread(CUTThread thread) { pthread_join(thread, NULL); }

 // Wait for multiple threads
-void cutWaitForThreads(const CUTThread *threads, int num) {
+void cutWaitForThreads(const CUTThread *threads, int num)
+{
 for (int i = 0; i < num; i++) {
 cutEndThread(threads[i]);
 }
 }

 // Create barrier.
-CUTBarrier cutCreateBarrier(int releaseCount) {
+CUTBarrier cutCreateBarrier(int releaseCount)
+{
 CUTBarrier barrier;

 barrier.count = 0;

@@ -112,7 +118,8 @@ CUTBarrier cutCreateBarrier(int releaseCount) {
 }

 // Increment barrier. (execution continues)
-void cutIncrementBarrier(CUTBarrier *barrier) {
+void cutIncrementBarrier(CUTBarrier *barrier)
+{
 int myBarrierCount;
 pthread_mutex_lock(&barrier->mutex);
 myBarrierCount = ++barrier->count;

@@ -124,7 +131,8 @@ void cutIncrementBarrier(CUTBarrier *barrier) {
 }

 // Wait for barrier release.
-void cutWaitForBarrier(CUTBarrier *barrier) {
+void cutWaitForBarrier(CUTBarrier *barrier)
+{
 pthread_mutex_lock(&barrier->mutex);

 while (barrier->count < barrier->releaseCount) {

@@ -135,7 +143,8 @@ void cutWaitForBarrier(CUTBarrier *barrier) {
 }

 // Destroy barrier
-void cutDestroyBarrier(CUTBarrier *barrier) {
+void cutDestroyBarrier(CUTBarrier *barrier)
+{
 pthread_mutex_destroy(&barrier->mutex);
 pthread_cond_destroy(&barrier->conditionVariable);
 }
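
The implementation above is a counting barrier: each worker checks in once via `cutIncrementBarrier()`, and `cutWaitForBarrier()` blocks until `releaseCount` check-ins have arrived. A usage sketch against this same API (the worker body and `runWorkers` wrapper are illustrative):

```cpp
// Sketch: n workers each signal the barrier once; the caller waits for all n.
#include "multithreading.h"

static CUTBarrier barrier;

static CUT_THREADPROC worker(void *arg)
{
    // ... per-thread work ...
    cutIncrementBarrier(&barrier); // check in; this thread keeps running
    CUT_THREADEND;
}

void runWorkers(int n)
{
    barrier = cutCreateBarrier(n); // released after n check-ins
    for (int i = 0; i < n; ++i)
        cutStartThread(worker, NULL);
    cutWaitForBarrier(&barrier);
    cutDestroyBarrier(&barrier);
}
```
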

@@ -37,7 +37,8 @@
 typedef HANDLE CUTThread;
 typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *);

-struct CUTBarrier {
+struct CUTBarrier
+{
 CRITICAL_SECTION criticalSection;
 HANDLE barrierEvent;
 int releaseCount;

@@ -57,7 +58,8 @@ typedef void *(*CUT_THREADROUTINE)(void *);
 #define CUT_THREADPROC void *
 #define CUT_THREADEND return 0

-struct CUTBarrier {
+struct CUTBarrier
+{
 pthread_mutex_t mutex;
 pthread_cond_t conditionVariable;
 int releaseCount;

@@ -67,7 +69,8 @@ struct CUTBarrier {
 #endif

 #ifdef __cplusplus
-extern "C" {
+extern "C"
+{
 #endif

 // Create thread.

@@ -43,8 +43,8 @@
 #include <stdio.h>

 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 #include "multithreading.h"

@@ -53,10 +53,10 @@ const int N_elements_per_workload = 100000;

 CUTBarrier thread_barrier;

-void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status,
-void *data);
+void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status, void *data);

-struct heterogeneous_workload {
+struct heterogeneous_workload
+{
 int id;
 int cudaDeviceID;

@@ -67,13 +67,16 @@ struct heterogeneous_workload {
 bool success;
 };

-__global__ void incKernel(int *data, int N) {
+__global__ void incKernel(int *data, int N)
+{
 int i = blockIdx.x * blockDim.x + threadIdx.x;

-if (i < N) data[i]++;
+if (i < N)
+data[i]++;
 }

-CUT_THREADPROC launch(void *void_arg) {
+CUT_THREADPROC launch(void *void_arg)
+{
 heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;

 // Select GPU for this CPU thread

@@ -81,11 +84,8 @@ CUT_THREADPROC launch(void *void_arg) {

 // Allocate Resources
 checkCudaErrors(cudaStreamCreate(&workload->stream));
-checkCudaErrors(
-cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
-checkCudaErrors(cudaHostAlloc(&workload->h_data,
-N_elements_per_workload * sizeof(int),
-cudaHostAllocPortable));
+checkCudaErrors(cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
+checkCudaErrors(cudaHostAlloc(&workload->h_data, N_elements_per_workload * sizeof(int), cudaHostAllocPortable));

 // CPU thread generates data
 for (int i = 0; i < N_elements_per_workload; ++i) {

@@ -97,25 +97,28 @@ CUT_THREADPROC launch(void *void_arg) {
 dim3 block(512);
 dim3 grid((N_elements_per_workload + block.x - 1) / block.x);

-checkCudaErrors(cudaMemcpyAsync(workload->d_data, workload->h_data,
+checkCudaErrors(cudaMemcpyAsync(workload->d_data,
+workload->h_data,
 N_elements_per_workload * sizeof(int),
-cudaMemcpyHostToDevice, workload->stream));
-incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data,
-N_elements_per_workload);
-checkCudaErrors(cudaMemcpyAsync(workload->h_data, workload->d_data,
+cudaMemcpyHostToDevice,
+workload->stream));
+incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data, N_elements_per_workload);
+checkCudaErrors(cudaMemcpyAsync(workload->h_data,
+workload->d_data,
 N_elements_per_workload * sizeof(int),
-cudaMemcpyDeviceToHost, workload->stream));
+cudaMemcpyDeviceToHost,
+workload->stream));

 // New in CUDA 5.0: Add a CPU callback which is called once all currently
 // pending operations in the CUDA stream have finished
-checkCudaErrors(
-cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));
+checkCudaErrors(cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));

 CUT_THREADEND;
 // CPU thread end of life, GPU continues to process data...
 }

-CUT_THREADPROC postprocess(void *void_arg) {
+CUT_THREADPROC postprocess(void *void_arg)
+{
 heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;
 // ... GPU is done with processing, continue on new CPU thread...

@@ -140,8 +143,8 @@ CUT_THREADPROC postprocess(void *void_arg) {
 CUT_THREADEND;
 }

-void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status,
-void *data) {
+void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data)
+{
 // Check status of GPU after stream operations are done
 checkCudaErrors(status);

@@ -149,7 +152,8 @@ void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status,
 cutStartThread(postprocess, data);
 }

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 int N_gpus, max_gpus = 0;
 int gpuInfo[32]; // assume a maximum of 32 GPUs in a system configuration

@@ -168,10 +172,8 @@ int main(int argc, char **argv) {
 cudaSetDevice(devid);
 cudaGetDeviceProperties(&deviceProp, devid);
 SMversion = deviceProp.major << 4 + deviceProp.minor;
-printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name,
-deviceProp.major, deviceProp.minor);
-printf(", %s GPU Callback Functions\n",
-(SMversion >= 0x11) ? "capable" : "NOT capable");
+printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name, deviceProp.major, deviceProp.minor);
+printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable");

 if (SMversion >= 0x11) {
 gpuInfo[max_gpus++] = devid;

@@ -181,8 +183,7 @@ int main(int argc, char **argv) {
 printf("%d GPUs available to run Callback Functions\n", max_gpus);

 heterogeneous_workload *workloads;
-workloads = (heterogeneous_workload *)malloc(N_workloads *
-sizeof(heterogeneous_workload));
+workloads = (heterogeneous_workload *)malloc(N_workloads * sizeof(heterogeneous_workload));
 ;
 thread_barrier = cutCreateBarrier(N_workloads);
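
The sample's core mechanism is `cudaStreamAddCallback()`: the host function fires on a CUDA-managed thread once all prior work in the stream completes, and it must not call into the CUDA API. A minimal sketch of the pattern (function names hypothetical):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Runs on a CUDA-managed thread after all prior work in `stream` finishes.
// CUDA API calls are not allowed inside a stream callback.
static void CUDART_CB onStreamDone(cudaStream_t stream, cudaError_t status, void *userData)
{
    printf("workload %d done, status %d\n", *(int *)userData, (int)status);
}

void enqueue(cudaStream_t stream, int *workloadId)
{
    // ... cudaMemcpyAsync() / kernel launches on `stream` ...
    cudaStreamAddCallback(stream, onStreamDone, workloadId, 0); // flags must be 0
}
```
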

@@ -38,8 +38,8 @@
 *
 */

-#include <stdio.h>
 #include <cooperative_groups.h>
+#include <stdio.h>

 using namespace cooperative_groups;

@@ -49,7 +49,8 @@ using namespace cooperative_groups;
 * calculates the sum of val across the group g. The workspace array, x,
 * must be large enough to contain g.size() integers.
 */
-__device__ int sumReduction(thread_group g, int *x, int val) {
+__device__ int sumReduction(thread_group g, int *x, int val)
+{
 // rank of this thread in the group
 int lane = g.thread_rank();

@@ -85,7 +86,8 @@ __device__ int sumReduction(thread_group g, int *x, int val) {
 *
 * Creates cooperative groups and performs reductions
 */
-__global__ void cgkernel() {
+__global__ void cgkernel()
+{
 // threadBlockGroup includes all threads in the block
 thread_block threadBlockGroup = this_thread_block();
 int threadBlockGroupSize = threadBlockGroup.size();

@@ -107,24 +109,22 @@ __global__ void cgkernel() {

 // master thread in group prints out result
 if (threadBlockGroup.thread_rank() == 0) {
-printf(
-" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
-(int)threadBlockGroup.size() - 1, output, expectedOutput);
+printf(" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
+(int)threadBlockGroup.size() - 1,
+output,
+expectedOutput);

-printf(" Now creating %d groups, each of size 16 threads:\n\n",
-(int)threadBlockGroup.size() / 16);
+printf(" Now creating %d groups, each of size 16 threads:\n\n", (int)threadBlockGroup.size() / 16);
 }

 threadBlockGroup.sync();

 // each tiledPartition16 group includes 16 threads
-thread_block_tile<16> tiledPartition16 =
-tiled_partition<16>(threadBlockGroup);
+thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup);

 // This offset allows each group to have its own unique area in the workspace
 // array
-int workspaceOffset =
-threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();
+int workspaceOffset = threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();

 // input to reduction, for each thread, is its' rank in the group
 input = tiledPartition16.thread_rank();

@@ -138,10 +138,10 @@ __global__ void cgkernel() {

 // each master thread prints out result
 if (tiledPartition16.thread_rank() == 0)
-printf(
-" Sum of all ranks 0..15 in this tiledPartition16 group is %d "
+printf(" Sum of all ranks 0..15 in this tiledPartition16 group is %d "
 "(expected %d)\n",
-output, expectedOutput);
+output,
+expectedOutput);

 return;
 }

@@ -149,7 +149,8 @@ __global__ void cgkernel() {
 /**
 * Host main routine
 */
-int main() {
+int main()
+{
 // Error code to check return values for CUDA calls
 cudaError_t err;

@@ -166,8 +167,7 @@ int main() {
 err = cudaDeviceSynchronize();

 if (err != cudaSuccess) {
-fprintf(stderr, "Failed to launch kernel (error code %s)!\n",
-cudaGetErrorString(err));
+fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(err));
 exit(EXIT_FAILURE);
 }
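
As a companion to the shared-memory `sumReduction()` above, a tile can also reduce through the cooperative-groups shuffle interface with no workspace array at all. A sketch under the same header (kernel and helper names hypothetical):

```cpp
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Reduce within a 16-thread tile using shuffles instead of a shared workspace.
__device__ int tileSum(cg::thread_block_tile<16> tile, int val)
{
    for (int offset = tile.size() / 2; offset > 0; offset /= 2)
        val += tile.shfl_down(val, offset);
    return val; // complete sum held by tile.thread_rank() == 0
}

__global__ void tileReduceDemo(int *out)
{
    cg::thread_block block = cg::this_thread_block();
    cg::thread_block_tile<16> tile = cg::tiled_partition<16>(block);
    int sum = tileSum(tile, (int)tile.thread_rank()); // expect 0+1+...+15 = 120
    if (tile.thread_rank() == 0)
        out[(block.thread_rank() - tile.thread_rank()) / 16] = sum; // one result per tile
}
```
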

@@ -36,17 +36,17 @@
 */

 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 // includes CUDA
 #include <cuda_runtime.h>

 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 static const char *sSDKname = "simpleCubemapTexture";

@@ -56,8 +56,8 @@ static const char *sSDKname = "simpleCubemapTexture";
 //! Transform a cubemap face of a linear buffe using cubemap texture lookups
 //! @param g_odata output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void transformKernel(float *g_odata, int width,
-cudaTextureObject_t tex) {
+__global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex)
+{
 // calculate this thread's data point
 unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
 unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

@@ -110,15 +110,15 @@ __global__ void transformKernel(float *g_odata, int width,
 }

 // read from texture, do expected transformation and write to global memory
-g_odata[face * width * width + y * width + x] =
--texCubemap<float>(tex, cx, cy, cz);
+g_odata[face * width * width + y * width + x] = -texCubemap<float>(tex, cx, cy, cz);
 }
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 // use command-line specified CUDA device, otherwise use device with highest
 // Gflops/s
 int devID = findCudaDevice(argc, (const char **)argv);

@@ -129,13 +129,11 @@ int main(int argc, char **argv) {
 cudaDeviceProp deviceProps;

 checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name,
-deviceProps.multiProcessorCount);
+printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
 printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

 if (deviceProps.major < 2) {
-printf(
-"%s requires SM 2.0 or higher for support of Texture Arrays. Test "
+printf("%s requires SM 2.0 or higher for support of Texture Arrays. Test "
 "will exit... \n",
 sSDKname);

@@ -157,8 +155,7 @@ int main(int argc, char **argv) {

 for (unsigned int layer = 0; layer < num_layers; layer++) {
 for (int i = 0; i < (int)(cubemap_size); i++) {
-h_data_ref[layer * cubemap_size + i] =
--h_data[layer * cubemap_size + i] + layer;
+h_data_ref[layer * cubemap_size + i] = -h_data[layer * cubemap_size + i] + layer;
 }
 }

@@ -167,19 +164,16 @@ int main(int argc, char **argv) {
 checkCudaErrors(cudaMalloc((void **)&d_data, size));

 // allocate array and copy image data
-cudaChannelFormatDesc channelDesc =
-cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
 cudaArray *cu_3darray;
 // checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc,
 // make_cudaExtent(width, height, num_layers), cudaArrayLayered ));
-checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc,
-make_cudaExtent(width, width, num_faces),
-cudaArrayCubemap));
+checkCudaErrors(
+cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, width, num_faces), cudaArrayCubemap));
 cudaMemcpy3DParms myparms = {0};
 myparms.srcPos = make_cudaPos(0, 0, 0);
 myparms.dstPos = make_cudaPos(0, 0, 0);
-myparms.srcPtr =
-make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
+myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
 myparms.dstArray = cu_3darray;
 myparms.extent = make_cudaExtent(width, width, num_faces);
 myparms.kind = cudaMemcpyHostToDevice;

@@ -207,10 +201,12 @@ int main(int argc, char **argv) {
 dim3 dimBlock(8, 8, 1);
 dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1);

-printf(
-"Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
+printf("Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
 "block has 8 x 8 threads\n",
-width, num_layers, dimGrid.x, dimGrid.y);
+width,
+num_layers,
+dimGrid.x,
+dimGrid.y);

 transformKernel<<<dimGrid, dimBlock>>>(d_data, width,
 tex); // warmup (for better timing)

@@ -233,8 +229,7 @@ int main(int argc, char **argv) {
 checkCudaErrors(cudaDeviceSynchronize());
 sdkStopTimer(&timer);
 printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
-printf("%.2f Mtexlookups/sec\n",
-(cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
+printf("%.2f Mtexlookups/sec\n", (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
 sdkDeleteTimer(&timer);

 // allocate mem for the result on host side

@@ -245,14 +240,13 @@ int main(int argc, char **argv) {
 // write regression file if necessary
 if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
 // write file for regression test
-sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f,
-false);
-} else {
+sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f, false);
+}
+else {
 printf("Comparing kernel output to expected data\n");

 #define MIN_EPSILON_ERROR 5e-3f
-bResult =
-compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
+bResult = compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
 }

 // cleanup memory
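
`transformKernel()` above takes its texture as a plain `cudaTextureObject_t` parameter; the object is built on the host over the `cudaArray`. A minimal sketch of that construction (descriptor settings are illustrative, not necessarily the sample's):

```cpp
#include <cuda_runtime.h>

// Build a texture object over an existing cudaArray so a kernel can take it
// as an ordinary parameter.
cudaTextureObject_t makeTexture(cudaArray_t array)
{
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = array;

    cudaTextureDesc texDesc = {};
    texDesc.normalizedCoords = 1;              // sample with coordinates in [0,1]
    texDesc.filterMode = cudaFilterModeLinear; // bilinear filtering
    texDesc.addressMode[0] = cudaAddressModeWrap;
    texDesc.addressMode[1] = cudaAddressModeWrap;
    texDesc.readMode = cudaReadModeElementType;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
    return tex; // destroy later with cudaDestroyTextureObject(tex)
}
```
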

@@ -33,12 +33,12 @@
 */

 // Includes
+#include <cstring>
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include <iostream>
 #include <stdio.h>
 #include <string.h>
-#include <cstring>
-#include <iostream>

 // includes, project
 #include <helper_cuda.h>

@@ -66,11 +66,10 @@ int CleanupNoFailure(CUcontext &cuContext);
 void RandomInit(float *, int);
 bool findModulePath(const char *, string &, char **, ostringstream &);

-static void check(CUresult result, char const *const func,
-const char *const file, int const line) {
+static void check(CUresult result, char const *const func, const char *const file, int const line)
+{
 if (result) {
-fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line,
-static_cast<unsigned int>(result), func);
+fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, static_cast<unsigned int>(result), func);
 exit(EXIT_FAILURE);
 }
 }

@@ -78,7 +77,8 @@ static void check(CUresult result, char const *const func,
 #define checkCudaDrvErrors(val) check((val), #val, __FILE__, __LINE__)

 // Host code
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 printf("simpleDrvRuntime..\n");
 int N = 50000, devID = 0;
 size_t size = N * sizeof(float);

@@ -100,7 +100,8 @@ int main(int argc, char **argv) {

 if (!findModulePath(FATBIN_FILE, module_path, argv, fatbin)) {
 exit(EXIT_FAILURE);
-} else {
+}
+else {
 printf("> initCUDA loading module: <%s>\n", module_path.c_str());
 }

@@ -113,8 +114,7 @@ int main(int argc, char **argv) {
 checkCudaDrvErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

 // Get function handle from module
-checkCudaDrvErrors(
-cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));
+checkCudaDrvErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

 // Allocate input vectors h_A and h_B in host memory
 checkCudaErrors(cudaMallocHost(&h_A, size));

@@ -133,10 +133,8 @@ int main(int argc, char **argv) {
 cudaStream_t stream;
 checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 // Copy vectors from host memory to device memory
-checkCudaErrors(
-cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
-checkCudaErrors(
-cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));
+checkCudaErrors(cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
+checkCudaErrors(cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));

 int threadsPerBlock = 256;
 int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

@@ -144,14 +142,12 @@ int main(int argc, char **argv) {
 void *args[] = {&d_A, &d_B, &d_C, &N};

 // Launch the CUDA kernel
-checkCudaDrvErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
-threadsPerBlock, 1, 1, 0, stream, args,
-NULL));
+checkCudaDrvErrors(
+cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, stream, args, NULL));

 // Copy result from device memory to host memory
 // h_C contains the result in host memory
-checkCudaErrors(
-cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
+checkCudaErrors(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
 checkCudaErrors(cudaStreamSynchronize(stream));
 // Verify result
 int i;

@@ -171,7 +167,8 @@ int main(int argc, char **argv) {
 exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
 }

-int CleanupNoFailure(CUcontext &cuContext) {
+int CleanupNoFailure(CUcontext &cuContext)
+{
 // Free device memory
 checkCudaErrors(cudaFree(d_A));
 checkCudaErrors(cudaFree(d_B));

@@ -195,19 +192,21 @@ int CleanupNoFailure(CUcontext &cuContext) {
 return EXIT_SUCCESS;
 }
 // Allocates an array with random float entries.
-void RandomInit(float *data, int n) {
+void RandomInit(float *data, int n)
+{
 for (int i = 0; i < n; ++i) {
 data[i] = rand() / (float)RAND_MAX;
 }
 }

-bool inline findModulePath(const char *module_file, string &module_path,
-char **argv, ostringstream &ostrm) {
+bool inline findModulePath(const char *module_file, string &module_path, char **argv, ostringstream &ostrm)
+{
 char *actual_path = sdkFindFilePath(module_file, argv[0]);

 if (actual_path) {
 module_path = actual_path;
-} else {
+}
+else {
 printf("> findModulePath file not found: <%s> \n", module_file);
 return false;
 }

@@ -215,7 +214,8 @@ bool inline findModulePath(const char *module_file, string &module_path,
 if (module_path.empty()) {
 printf("> findModulePath could not find file: <%s> \n", module_file);
 return false;
-} else {
+}
+else {
 printf("> findModulePath found file at <%s>\n", module_path.c_str());
 if (module_path.rfind("fatbin") != string::npos) {
 ifstream fileIn(module_path.c_str(), ios::binary);
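
The driver-API launch above passes kernel arguments as an array of pointers to the argument values, in declaration order. A compact sketch of the same call (wrapper name hypothetical):

```cpp
#include <cuda.h>

// Driver-API launch: args[] holds pointers to each kernel argument value.
void launchVecAdd(CUfunction f, CUdeviceptr dA, CUdeviceptr dB, CUdeviceptr dC, int n, CUstream stream)
{
    void *args[] = {&dA, &dB, &dC, &n};
    unsigned int threads = 256;
    unsigned int blocks = (n + threads - 1) / threads;
    cuLaunchKernel(f,
                   blocks, 1, 1,  // grid dimensions
                   threads, 1, 1, // block dimensions
                   0,             // dynamic shared memory (bytes)
                   stream,
                   args,          // kernelParams
                   NULL);         // extra
}
```
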

@@ -34,9 +34,10 @@
 */

 // Device code
-extern "C" __global__ void VecAdd_kernel(const float *A, const float *B,
-float *C, int N) {
+extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
+{
 int i = blockDim.x * blockIdx.x + threadIdx.x;

-if (i < N) C[i] = A[i] + B[i];
+if (i < N)
+C[i] = A[i] + B[i];
 }

@@ -44,7 +44,8 @@ const char *sSDKsample = "hyperQ";

 // This subroutine does no real work but runs for at least the specified number
 // of clock ticks.
-__device__ void clock_block(clock_t *d_o, clock_t clock_count) {
+__device__ void clock_block(clock_t *d_o, clock_t clock_count)
+{
 unsigned int start_clock = (unsigned int)clock();

 clock_t clock_offset = 0;

@@ -71,15 +72,12 @@ __device__ void clock_block(clock_t *d_o, clock_t clock_count) {
 // We create two identical kernels calling clock_block(), we create two so that
 // we can identify dependencies in the profile timeline ("kernel_B" is always
 // dependent on "kernel_A" in the same stream).
-__global__ void kernel_A(clock_t *d_o, clock_t clock_count) {
-clock_block(d_o, clock_count);
-}
-__global__ void kernel_B(clock_t *d_o, clock_t clock_count) {
-clock_block(d_o, clock_count);
-}
+__global__ void kernel_A(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); }
+__global__ void kernel_B(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); }

 // Single-warp reduction kernel (note: this is not optimized for simplicity)
-__global__ void sum(clock_t *d_clocks, int N) {
+__global__ void sum(clock_t *d_clocks, int N)
+{
 // Handle to thread block group
 cg::thread_block cta = cg::this_thread_block();
 __shared__ clock_t s_clocks[32];

@@ -106,7 +104,8 @@ __global__ void sum(clock_t *d_clocks, int N) {
 }
 }

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 int nstreams = 32; // One stream for each pair of kernels
 float kernel_time = 10; // Time each kernel should run in ms
 float elapsed_time;

@@ -131,18 +130,20 @@ int main(int argc, char **argv) {
 // HyperQ is available in devices of Compute Capability 3.5 and higher
 if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) {
 if (deviceProp.concurrentKernels == 0) {
-printf(
-"> GPU does not support concurrent kernel execution (SM 3.5 or "
+printf("> GPU does not support concurrent kernel execution (SM 3.5 or "
 "higher required)\n");
 printf(" CUDA kernel runs will be serialized\n");
-} else {
+}
+else {
 printf("> GPU does not support HyperQ\n");
 printf(" CUDA kernel runs will have limited concurrency\n");
 }
 }

 printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
-deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
+deviceProp.major,
+deviceProp.minor,
+deviceProp.multiProcessorCount);

 // Allocate host memory for the output (reduced to a single value)
 clock_t *a = 0;

@@ -153,8 +154,7 @@ int main(int argc, char **argv) {
 checkCudaErrors(cudaMalloc((void **)&d_a, 2 * nstreams * sizeof(clock_t)));

 // Allocate and initialize an array of stream handles
-cudaStream_t *streams =
-(cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
+cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

 for (int i = 0; i < nstreams; i++) {
 checkCudaErrors(cudaStreamCreate(&(streams[i])));

@@ -203,15 +203,15 @@ int main(int argc, char **argv) {
 checkCudaErrors(cudaEventSynchronize(stop_event));
 checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));

-printf(
-"Expected time for serial execution of %d sets of kernels is between "
+printf("Expected time for serial execution of %d sets of kernels is between "
 "approx. %.3fs and %.3fs\n",
-nstreams, (nstreams + 1) * kernel_time / 1000.0f,
+nstreams,
+(nstreams + 1) * kernel_time / 1000.0f,
 2 * nstreams * kernel_time / 1000.0f);
-printf(
-"Expected time for fully concurrent execution of %d sets of kernels is "
+printf("Expected time for fully concurrent execution of %d sets of kernels is "
 "approx. %.3fs\n",
-nstreams, 2 * kernel_time / 1000.0f);
+nstreams,
+2 * kernel_time / 1000.0f);
 printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f);

 bool bTestResult = (a[0] >= total_clocks);
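
The measured concurrency comes from issuing each `kernel_A`/`kernel_B` pair into its own stream, depth-first; with HyperQ the pairs overlap across streams. A sketch of that launch loop, assuming `streams`, `d_a`, and a per-kernel tick budget `time_clocks` as set up in the sample:

```cpp
// Each stream gets a dependent pair; across streams the pairs can overlap.
for (int i = 0; i < nstreams; ++i) {
    kernel_A<<<1, 1, 0, streams[i]>>>(&d_a[2 * i], time_clocks);     // pair start
    kernel_B<<<1, 1, 0, streams[i]>>>(&d_a[2 * i + 1], time_clocks); // waits on kernel_A in-stream
}
```
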

@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.
-
 ## References (for more details)
 
@ -32,6 +32,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "helper_cuda.h"
|
#include "helper_cuda.h"
|
||||||
#include "helper_multiprocess.h"
|
#include "helper_multiprocess.h"
|
||||||
static const char shmName[] = "simpleIPCshm";
|
static const char shmName[] = "simpleIPCshm";
|
||||||
@ -49,7 +50,8 @@ static const char shmName[] = "simpleIPCshm";
|
|||||||
#error Unsupported system
|
#error Unsupported system
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct shmStruct_st {
|
typedef struct shmStruct_st
|
||||||
|
{
|
||||||
size_t nprocesses;
|
size_t nprocesses;
|
||||||
int barrier;
|
int barrier;
|
||||||
int sense;
|
int sense;
|
||||||
@ -58,15 +60,16 @@ typedef struct shmStruct_st {
|
|||||||
cudaIpcEventHandle_t eventHandle[MAX_DEVICES];
|
cudaIpcEventHandle_t eventHandle[MAX_DEVICES];
|
||||||
} shmStruct;
|
} shmStruct;
|
||||||
|
|
||||||
__global__ void simpleKernel(char *ptr, int sz, char val) {
|
__global__ void simpleKernel(char *ptr, int sz, char val)
|
||||||
|
{
|
||||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
for (; idx < sz; idx += (gridDim.x * blockDim.x)) {
|
for (; idx < sz; idx += (gridDim.x * blockDim.x)) {
|
||||||
ptr[idx] = val;
|
ptr[idx] = val;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void barrierWait(volatile int *barrier, volatile int *sense,
|
static void barrierWait(volatile int *barrier, volatile int *sense, unsigned int n)
|
||||||
unsigned int n) {
|
{
|
||||||
int count;
|
int count;
|
||||||
|
|
||||||
// Check-in
|
// Check-in
|
||||||
@ -84,7 +87,8 @@ static void barrierWait(volatile int *barrier, volatile int *sense,
|
|||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void childProcess(int id) {
|
static void childProcess(int id)
|
||||||
|
{
|
||||||
volatile shmStruct *shm = NULL;
|
volatile shmStruct *shm = NULL;
|
||||||
cudaStream_t stream;
|
cudaStream_t stream;
|
||||||
sharedMemoryInfo info;
|
sharedMemoryInfo info;
|
||||||
@ -108,8 +112,7 @@ static void childProcess(int id) {
|
|||||||
checkCudaErrors(cudaSetDevice(shm->devices[id]));
|
checkCudaErrors(cudaSetDevice(shm->devices[id]));
|
||||||
checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id]));
|
checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id]));
|
||||||
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||||
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks, simpleKernel, threads, 0));
|
||||||
&blocks, simpleKernel, threads, 0));
|
|
||||||
blocks *= prop.multiProcessorCount;
|
blocks *= prop.multiProcessorCount;
|
||||||
|
|
||||||
// Open and track all the allocations and events created in the master
|
// Open and track all the allocations and events created in the master
|
||||||
@ -121,10 +124,8 @@ static void childProcess(int id) {
|
|||||||
// Notice, we don't need to explicitly enable peer access for
|
// Notice, we don't need to explicitly enable peer access for
|
||||||
// allocations on other devices.
|
// allocations on other devices.
|
||||||
checkCudaErrors(
|
checkCudaErrors(
|
||||||
cudaIpcOpenMemHandle(&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle[i],
|
cudaIpcOpenMemHandle(&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle[i], cudaIpcMemLazyEnablePeerAccess));
|
||||||
cudaIpcMemLazyEnablePeerAccess));
|
checkCudaErrors(cudaIpcOpenEventHandle(&event, *(cudaIpcEventHandle_t *)&shm->eventHandle[i]));
|
||||||
checkCudaErrors(cudaIpcOpenEventHandle(
|
|
||||||
&event, *(cudaIpcEventHandle_t *)&shm->eventHandle[i]));
|
|
||||||
|
|
||||||
ptrs.push_back(ptr);
|
ptrs.push_back(ptr);
|
||||||
events.push_back(event);
|
events.push_back(event);
|
||||||
@ -141,8 +142,7 @@ static void childProcess(int id) {
|
|||||||
// Wait for the buffer to be accessed to be ready
|
// Wait for the buffer to be accessed to be ready
|
||||||
checkCudaErrors(cudaStreamWaitEvent(stream, events[bufferId], 0));
|
checkCudaErrors(cudaStreamWaitEvent(stream, events[bufferId], 0));
|
||||||
// Push a simple kernel on it
|
// Push a simple kernel on it
|
||||||
simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId],
|
simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId], DATA_SIZE, id);
|
||||||
DATA_SIZE, id);
|
|
||||||
checkCudaErrors(cudaGetLastError());
|
checkCudaErrors(cudaGetLastError());
|
||||||
// Signal that this buffer is ready for the next consumer
|
// Signal that this buffer is ready for the next consumer
|
||||||
checkCudaErrors(cudaEventRecord(events[bufferId], stream));
|
checkCudaErrors(cudaEventRecord(events[bufferId], stream));
|
||||||
@ -158,8 +158,7 @@ static void childProcess(int id) {
|
|||||||
|
|
||||||
// Now wait for my buffer to be ready so I can copy it locally and verify it
|
// Now wait for my buffer to be ready so I can copy it locally and verify it
|
||||||
checkCudaErrors(cudaStreamWaitEvent(stream, events[id], 0));
|
checkCudaErrors(cudaStreamWaitEvent(stream, events[id], 0));
|
||||||
checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE,
|
checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, cudaMemcpyDeviceToHost, stream));
|
||||||
cudaMemcpyDeviceToHost, stream));
|
|
||||||
// And wait for all the queued up work to complete
|
// And wait for all the queued up work to complete
|
||||||
checkCudaErrors(cudaStreamSynchronize(stream));
|
checkCudaErrors(cudaStreamSynchronize(stream));
|
||||||
|
|
||||||
@ -169,8 +168,11 @@ static void childProcess(int id) {
|
|||||||
char compareId = (char)((id + 1) % procCount);
|
char compareId = (char)((id + 1) % procCount);
|
||||||
for (unsigned long long j = 0; j < DATA_SIZE; j++) {
|
for (unsigned long long j = 0; j < DATA_SIZE; j++) {
|
||||||
         if (verification_buffer[j] != compareId) {
-            printf("Process %d: Verification mismatch at %lld: %d != %d\n", id, j,
-                   (int)verification_buffer[j], (int)compareId);
+            printf("Process %d: Verification mismatch at %lld: %d != %d\n",
+                   id,
+                   j,
+                   (int)verification_buffer[j],
+                   (int)compareId);
         }
     }

@@ -185,7 +187,8 @@ static void childProcess(int id) {
     printf("Process %d complete!\n", id);
 }

-static void parentProcess(char *app) {
+static void parentProcess(char *app)
+{
     sharedMemoryInfo info;
     int devCount, i;
     volatile shmStruct *shm = NULL;

@@ -219,17 +222,14 @@ static void parentProcess(char *app) {
         // This sample requires two processes accessing each device, so we need
         // to ensure exclusive or prohibited mode is not set
         if (prop.computeMode != cudaComputeModeDefault) {
-            printf("Device %d is in an unsupported compute mode for this sample\n",
-                   i);
+            printf("Device %d is in an unsupported compute mode for this sample\n", i);
             continue;
         }

         for (int j = 0; j < shm->nprocesses; j++) {
             int canAccessPeerIJ, canAccessPeerJI;
-            checkCudaErrors(
-                cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i));
-            checkCudaErrors(
-                cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j]));
+            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i));
+            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j]));
             if (!canAccessPeerIJ || !canAccessPeerJI) {
                 allPeers = false;
                 break;

@@ -246,10 +246,11 @@ static void parentProcess(char *app) {
                 checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0));
             }
             shm->devices[shm->nprocesses++] = i;
-            if (shm->nprocesses >= MAX_DEVICES) break;
-        } else {
-            printf(
-                "Device %d is not peer capable with some other selected peers, "
-                "skipping\n",
-                i);
+            if (shm->nprocesses >= MAX_DEVICES)
+                break;
+        }
+        else {
+            printf("Device %d is not peer capable with some other selected peers, "
+                   "skipping\n",
+                   i);
         }

@@ -268,12 +269,9 @@ static void parentProcess(char *app) {

         checkCudaErrors(cudaSetDevice(shm->devices[i]));
         checkCudaErrors(cudaMalloc(&ptr, DATA_SIZE));
-        checkCudaErrors(
-            cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle[i], ptr));
-        checkCudaErrors(cudaEventCreate(
-            &event, cudaEventDisableTiming | cudaEventInterprocess));
-        checkCudaErrors(cudaIpcGetEventHandle(
-            (cudaIpcEventHandle_t *)&shm->eventHandle[i], event));
+        checkCudaErrors(cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle[i], ptr));
+        checkCudaErrors(cudaEventCreate(&event, cudaEventDisableTiming | cudaEventInterprocess));
+        checkCudaErrors(cudaIpcGetEventHandle((cudaIpcEventHandle_t *)&shm->eventHandle[i], event));

         ptrs.push_back(ptr);
         events.push_back(event);

@@ -314,14 +312,16 @@ static void parentProcess(char *app) {
     sharedMemoryClose(&info);
 }

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
 #if defined(__arm__) || defined(__aarch64__)
     printf("Not supported on ARM\n");
     return EXIT_WAIVED;
 #else
     if (argc == 1) {
         parentProcess(argv[0]);
-    } else {
+    }
+    else {
         childProcess(atoi(argv[1]));
     }
     return EXIT_SUCCESS;
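
Aside: stripped of the shared-memory plumbing, the IPC handshake these hunks reformat reduces to a few calls. A minimal sketch, assuming the parent exports an allocation and the 64-byte handle reaches the child out of band (the variable names here are illustrative, not the sample's):

// Parent: allocate device memory and serialize a handle for another process.
float *d_buf = NULL;
cudaIpcMemHandle_t handle;
checkCudaErrors(cudaMalloc(&d_buf, DATA_SIZE));
checkCudaErrors(cudaIpcGetMemHandle(&handle, d_buf));
// ... hand `handle` to the child (shared memory, pipe, etc.) ...

// Child: map the parent's allocation into this process.
float *d_mapped = NULL;
checkCudaErrors(cudaIpcOpenMemHandle((void **)&d_mapped, handle, cudaIpcMemLazyEnablePeerAccess));
// ... use d_mapped like any other device pointer, then unmap ...
checkCudaErrors(cudaIpcCloseMemHandle(d_mapped));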
@@ -36,10 +36,10 @@
  */

 // includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 // includes, kernels
 #include <cuda_runtime.h>

@@ -54,8 +54,8 @@ static const char *sSDKname = "simpleLayeredTexture";
 //! Transform a layer of a layered 2D texture using texture lookups
 //! @param g_odata output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void transformKernel(float *g_odata, int width, int height,
-                                int layer, cudaTextureObject_t tex) {
+__global__ void transformKernel(float *g_odata, int width, int height, int layer, cudaTextureObject_t tex)
+{
     // calculate this thread's data point
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

@@ -67,14 +67,14 @@ __global__ void transformKernel(float *g_odata, int width, int height,
     float v = (y + 0.5f) / (float)height;

     // read from texture, do expected transformation and write to global memory
-    g_odata[layer * width * height + y * width + x] =
-        -tex2DLayered<float>(tex, u, v, layer) + layer;
+    g_odata[layer * width * height + y * width + x] = -tex2DLayered<float>(tex, u, v, layer) + layer;
 }

 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("[%s] - Starting...\n", sSDKname);

     // use command-line specified CUDA device, otherwise use device with highest

@@ -87,8 +87,7 @@ int main(int argc, char **argv) {
     cudaDeviceProp deviceProps;

     checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name,
-           deviceProps.multiProcessorCount);
+    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
     printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

     // generate input data for layered texture

@@ -106,8 +105,7 @@ int main(int argc, char **argv) {

     for (unsigned int layer = 0; layer < num_layers; layer++)
         for (int i = 0; i < (int)(width * height); i++) {
-            h_data_ref[layer * width * height + i] =
-                -h_data[layer * width * height + i] + layer;
+            h_data_ref[layer * width * height + i] = -h_data[layer * width * height + i] + layer;
         }

     // allocate device memory for result

@@ -115,17 +113,14 @@ int main(int argc, char **argv) {
     checkCudaErrors(cudaMalloc((void **)&d_data, size));

     // allocate array and copy image data
-    cudaChannelFormatDesc channelDesc =
-        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
     cudaArray *cu_3darray;
-    checkCudaErrors(cudaMalloc3DArray(&cu_3darray, &channelDesc,
-                                      make_cudaExtent(width, height, num_layers),
-                                      cudaArrayLayered));
+    checkCudaErrors(
+        cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
     cudaMemcpy3DParms myparms = {0};
     myparms.srcPos = make_cudaPos(0, 0, 0);
     myparms.dstPos = make_cudaPos(0, 0, 0);
-    myparms.srcPtr =
-        make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
+    myparms.srcPtr = make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
     myparms.dstArray = cu_3darray;
     myparms.extent = make_cudaExtent(width, height, num_layers);
     myparms.kind = cudaMemcpyHostToDevice;

@@ -152,10 +147,12 @@ int main(int argc, char **argv) {
     dim3 dimBlock(8, 8, 1);
     dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

-    printf(
-        "Covering 2D data array of %d x %d: Grid size is %d x %d, each block has "
-        "8 x 8 threads\n",
-        width, height, dimGrid.x, dimGrid.y);
+    printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has "
+           "8 x 8 threads\n",
+           width,
+           height,
+           dimGrid.x,
+           dimGrid.y);

     transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, 0,
                                            tex); // warmup (for better timing)

@@ -171,8 +168,7 @@ int main(int argc, char **argv) {

     // execute the kernel
     for (unsigned int layer = 0; layer < num_layers; layer++)
-        transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, layer,
-                                                  tex);
+        transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, layer, tex);

     // check if kernel execution generated an error
     getLastCudaError("Kernel execution failed");

@@ -180,9 +176,7 @@ int main(int argc, char **argv) {
     checkCudaErrors(cudaDeviceSynchronize());
     sdkStopTimer(&timer);
     printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
-    printf("%.2f Mtexlookups/sec\n",
-           (width * height * num_layers / (sdkGetTimerValue(&timer) / 1000.0f) /
-            1e6));
+    printf("%.2f Mtexlookups/sec\n", (width * height * num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
     sdkDeleteTimer(&timer);

     // allocate mem for the result on host side

@@ -193,14 +187,13 @@ int main(int argc, char **argv) {
     // write regression file if necessary
     if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
         // write file for regression test
-        sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f,
-                            false);
-    } else {
+        sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, false);
+    }
+    else {
         printf("Comparing kernel output to expected data\n");

 #define MIN_EPSILON_ERROR 5e-3f
-        bResult = compareData(h_odata, h_data_ref, width * height * num_layers,
-                              MIN_EPSILON_ERROR, 0.0f);
+        bResult = compareData(h_odata, h_data_ref, width * height * num_layers, MIN_EPSILON_ERROR, 0.0f);
     }

     // cleanup memory
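
Aside: the `tex` object sampled by `transformKernel` is built from the layered array allocated above. A minimal sketch of that setup; the filter and address modes shown are assumptions, since they are not visible in these hunks:

cudaTextureObject_t tex = 0;

cudaResourceDesc texRes = {};
texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = cu_3darray; // the cudaArrayLayered allocation from above

cudaTextureDesc texDescr = {};
texDescr.normalizedCoords = true;              // kernel passes u, v in [0, 1]
texDescr.filterMode = cudaFilterModeLinear;    // assumed
texDescr.addressMode[0] = cudaAddressModeWrap; // assumed
texDescr.addressMode[1] = cudaAddressModeWrap; // assumed
texDescr.readMode = cudaReadModeElementType;

checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));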
@@ -42,8 +42,8 @@
 // System includes
 #include <iostream>

-using std::cout;
 using std::cerr;
+using std::cout;
 using std::endl;

 // User include

@@ -58,7 +58,8 @@ using std::endl;

 // Host code
 // No CUDA here, only MPI
-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     // Dimensions of the dataset
     int blockSize = 256;
     int gridSize = 10000;

@@ -87,8 +88,8 @@ int main(int argc, char *argv[]) {
     float *dataNode = new float[dataSizePerNode];

     // Dispatch a portion of the input data to each node
-    MPI_CHECK(MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode,
-                          dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD));
+    MPI_CHECK(
+        MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD));

     if (commRank == 0) {
         // No need for root data any more

@@ -102,8 +103,7 @@ int main(int argc, char *argv[]) {
     float sumNode = sum(dataNode, dataSizePerNode);
     float sumRoot;

-    MPI_CHECK(
-        MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));
+    MPI_CHECK(MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));

     if (commRank == 0) {
         float average = sumRoot / dataSizeTotal;

@@ -122,7 +122,8 @@ int main(int argc, char *argv[]) {
 }

 // Shut down MPI cleanly if something goes wrong
-void my_abort(int err) {
+void my_abort(int err)
+{
     cout << "Test FAILED\n";
     MPI_Abort(MPI_COMM_WORLD, err);
 }
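
Aside: `MPI_CHECK` is the sample's error-checking wrapper; its definition lives outside these hunks. A plausible minimal definition, offered only as a hedged sketch of what such a macro typically looks like (the sample's actual macro may differ):

// Hypothetical reconstruction, not the sample's verbatim code.
#define MPI_CHECK(call)                                                      \
    do {                                                                     \
        int mpi_status = (call);                                             \
        if (mpi_status != MPI_SUCCESS) {                                     \
            cerr << "MPI error " << mpi_status << " at " << __FILE__ << ":"  \
                 << __LINE__ << endl;                                        \
            my_abort(mpi_status); /* my_abort() is defined in this file */   \
        }                                                                    \
    } while (0)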
@@ -51,13 +51,15 @@ using std::endl;

 // Device code
 // Very simple GPU Kernel that computes square roots of input numbers
-__global__ void simpleMPIKernel(float *input, float *output) {
+__global__ void simpleMPIKernel(float *input, float *output)
+{
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
     output[tid] = sqrt(input[tid]);
 }

 // Initialize an array with random data (between 0 and 1)
-void initData(float *data, int dataSize) {
+void initData(float *data, int dataSize)
+{
     for (int i = 0; i < dataSize; i++) {
         data[i] = (float)rand() / RAND_MAX;
     }

@@ -65,7 +67,8 @@ void initData(float *data, int dataSize) {

 // CUDA computation on each node
 // No MPI here, only CUDA
-void computeGPU(float *hostData, int blockSize, int gridSize) {
+void computeGPU(float *hostData, int blockSize, int gridSize)
+{
     int dataSize = blockSize * gridSize;

     // Allocate data on GPU memory

@@ -76,22 +79,21 @@ void computeGPU(float *hostData, int blockSize, int gridSize) {
     CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));

     // Copy to GPU memory
-    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float),
-                          cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));

     // Run kernel
     simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);

     // Copy data back to CPU memory
-    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float),
-                          cudaMemcpyDeviceToHost));
+    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost));

     // Free GPU memory
     CUDA_CHECK(cudaFree(deviceInputData));
     CUDA_CHECK(cudaFree(deviceOutputData));
 }

-float sum(float *data, int size) {
+float sum(float *data, int size)
+{
     float accum = 0.f;

     for (int i = 0; i < size; i++) {
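
Aside: `simpleMPIKernel` relies on the caller sizing the grid so that `gridSize * blockSize == dataSize`, as `computeGPU` does. A bounds-checked variant (a sketch, not the sample's code) tolerates arbitrary sizes:

__global__ void simpleMPIKernelBounded(const float *input, float *output, int n)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n) // guard the partial final block
        output[tid] = sqrtf(input[tid]);
}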
@@ -36,7 +36,8 @@
 */

 // Forward declarations
-extern "C" {
+extern "C"
+{
 void initData(float *data, int dataSize);
 void computeGPU(float *hostData, int blockSize, int gridSize);
 float sum(float *data, int size);
@@ -55,7 +55,8 @@ const char *sSDKname = "simpleMultiCopy";
 // includes, kernels
 // Declare the CUDA kernels here and main() code that is needed to launch
 // Compute workload on the system
-__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps) {
+__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps)
+{
     int idx = blockIdx.x * blockDim.x + threadIdx.x;

     if (idx < N) {

@@ -102,7 +103,8 @@ bool test();
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char *argv[]) {
+int main(int argc, char *argv[])
+{
     int cuda_device = 0;
     float scale_factor;
     cudaDeviceProp deviceProp;

@@ -115,7 +117,8 @@ int main(int argc, char *argv[]) {
         if (cuda_device < 0) {
             printf("Invalid command line parameters\n");
             exit(EXIT_FAILURE);
-        } else {
+        }
+        else {
             printf("cuda_device = %d\n", cuda_device);
             cuda_device = gpuDeviceInit(cuda_device);

@@ -124,7 +127,8 @@ int main(int argc, char *argv[]) {
                 exit(EXIT_SUCCESS);
             }
         }
-    } else {
+    }
+    else {
         // Otherwise pick the device with the highest Gflops/s
         cuda_device = gpuGetMaxGflopsDeviceId();
         checkCudaErrors(cudaSetDevice(cuda_device));

@@ -133,22 +137,23 @@ int main(int argc, char *argv[]) {
     }

     checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
-    printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n", deviceProp.name,
+    printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
+           deviceProp.name,
            deviceProp.multiProcessorCount,
            _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
-           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
-               deviceProp.multiProcessorCount);
+           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

     // Anything that is less than 32 Cores will have scaled down workload
     scale_factor =
-        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
-                      (float)deviceProp.multiProcessorCount)),
+        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
             1.0f);
     N = (int)((float)N / scale_factor);

     printf("> Device name: %s\n", deviceProp.name);
     printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
-           deviceProp.major, deviceProp.minor, deviceProp.multiProcessorCount);
+           deviceProp.major,
+           deviceProp.minor,
+           deviceProp.multiProcessorCount);
     printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
     printf("> array_size = %d\n\n", N);

@@ -165,13 +170,11 @@ int main(int argc, char *argv[]) {
     h_data_sink = (int *)malloc(memsize);

     for (int i = 0; i < STREAM_COUNT; ++i) {
-        checkCudaErrors(
-            cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
+        checkCudaErrors(cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
         checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
         checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));

-        checkCudaErrors(
-            cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
+        checkCudaErrors(cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
         checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));

         checkCudaErrors(cudaStreamCreate(&stream[i]));

@@ -190,8 +193,7 @@ int main(int argc, char *argv[]) {

     // Time copies and kernel
     cudaEventRecord(start, 0);
-    checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize,
-                                    cudaMemcpyHostToDevice, 0));
+    checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice, 0));
     cudaEventRecord(stop, 0);
     cudaEventSynchronize(stop);

@@ -199,8 +201,7 @@ int main(int argc, char *argv[]) {
     cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

     cudaEventRecord(start, 0);
-    checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize,
-                                    cudaMemcpyDeviceToHost, 0));
+    checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0));
     cudaEventRecord(stop, 0);
     cudaEventSynchronize(stop);

@@ -217,35 +218,27 @@ int main(int argc, char *argv[]) {

     printf("\n");
     printf("Relevant properties of this CUDA device\n");
-    printf(
-        "(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
-        "(device property \"deviceOverlap\")\n",
-        deviceProp.deviceOverlap ? "X" : " ");
+    printf("(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
+           "(device property \"deviceOverlap\")\n",
+           deviceProp.deviceOverlap ? "X" : " ");
     // printf("(%s) Can execute several GPU kernels simultaneously (compute
     // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
-    printf(
-        "(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
-        " (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
-        "4000/5000/6000/K5000)\n",
-        (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");
+    printf("(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
+           " (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
+           "4000/5000/6000/K5000)\n",
+           (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");

     printf("\n");
     printf("Measured timings (throughput):\n");
-    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time,
-           (memsize * 1e-6) / memcpy_h2d_time);
-    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time,
-           (memsize * 1e-6) / memcpy_d2h_time);
-    printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time,
-           (inner_reps * memsize * 2e-6) / kernel_time);
+    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time);
+    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time);
+    printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time, (inner_reps * memsize * 2e-6) / kernel_time);

     printf("\n");
-    printf(
-        "Theoretical limits for speedup gained from overlapped data "
-        "transfers:\n");
-    printf("No overlap at all (transfer-kernel-transfer): %f ms \n",
-           memcpy_h2d_time + memcpy_d2h_time + kernel_time);
-    printf("Compute can overlap with one transfer: %f ms\n",
-           max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
+    printf("Theoretical limits for speedup gained from overlapped data "
+           "transfers:\n");
+    printf("No overlap at all (transfer-kernel-transfer): %f ms \n", memcpy_h2d_time + memcpy_d2h_time + kernel_time);
+    printf("Compute can overlap with one transfer: %f ms\n", max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
     printf("Compute can overlap with both data transfers: %f ms\n",
            max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));

@@ -254,18 +247,13 @@ int main(int argc, char *argv[]) {
     float overlap_time = processWithStreams(STREAM_COUNT);

     printf("\nAverage measured timings over %d repetitions:\n", nreps);
-    printf(" Avg. time when execution fully serialized\t: %f ms\n",
-           serial_time / nreps);
-    printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT,
-           overlap_time / nreps);
-    printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n",
-           (serial_time - overlap_time) / nreps);
+    printf(" Avg. time when execution fully serialized\t: %f ms\n", serial_time / nreps);
+    printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT, overlap_time / nreps);
+    printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n", (serial_time - overlap_time) / nreps);

     printf("\nMeasured throughput:\n");
-    printf(" Fully serialized execution\t\t: %f GB/s\n",
-           (nreps * (memsize * 2e-6)) / serial_time);
-    printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT,
-           (nreps * (memsize * 2e-6)) / overlap_time);
+    printf(" Fully serialized execution\t\t: %f GB/s\n", (nreps * (memsize * 2e-6)) / serial_time);
+    printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT, (nreps * (memsize * 2e-6)) / overlap_time);

     // Verify the results, we will use the results for final output
     bool bResults = test();

@@ -293,7 +281,8 @@ int main(int argc, char *argv[]) {
     exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
 }

-float processWithStreams(int streams_used) {
+float processWithStreams(int streams_used)
+{
     int current_stream = 0;

     float time;

@@ -326,17 +315,17 @@ float processWithStreams(int streams_used) {
             d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);

         // Upload next frame
-        checkCudaErrors(
-            cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream], memsize,
-                            cudaMemcpyHostToDevice, stream[next_stream]));
+        checkCudaErrors(cudaMemcpyAsync(
+            d_data_in[next_stream], h_data_in[next_stream], memsize, cudaMemcpyHostToDevice, stream[next_stream]));

         // Download current frame
-        checkCudaErrors(cudaMemcpyAsync(
-            h_data_out[current_stream], d_data_out[current_stream], memsize,
-            cudaMemcpyDeviceToHost, stream[current_stream]));
+        checkCudaErrors(cudaMemcpyAsync(h_data_out[current_stream],
+                                        d_data_out[current_stream],
+                                        memsize,
+                                        cudaMemcpyDeviceToHost,
+                                        stream[current_stream]));

-        checkCudaErrors(
-            cudaEventRecord(cycleDone[current_stream], stream[current_stream]));
+        checkCudaErrors(cudaEventRecord(cycleDone[current_stream], stream[current_stream]));

         current_stream = next_stream;
     }

@@ -350,7 +339,8 @@ float processWithStreams(int streams_used) {
     return time;
 }

-void init() {
+void init()
+{
     for (int i = 0; i < N; ++i) {
         h_data_source[i] = 0;
     }

@@ -360,7 +350,8 @@ void init() {
     }
 }

-bool test() {
+bool test()
+{
     bool passed = true;

     for (int j = 0; j < STREAM_COUNT; ++j) {
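
Aside: the double-buffered loop in `processWithStreams` boils down to the following pattern. A condensed sketch using the sample's buffers and streams (loop bound and launch configuration are illustrative):

int current_stream = 0;
for (int i = 0; i < nreps; ++i) {
    int next_stream = (current_stream + 1) % STREAM_COUNT;

    // The current frame's kernel runs in its own stream ...
    incKernel<<<grid, block, 0, stream[current_stream]>>>(
        d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);

    // ... while the next frame's input uploads concurrently in another stream,
    cudaMemcpyAsync(d_data_in[next_stream], h_data_in[next_stream], memsize,
                    cudaMemcpyHostToDevice, stream[next_stream]);

    // and the current result downloads behind its kernel in the same stream.
    cudaMemcpyAsync(h_data_out[current_stream], d_data_out[current_stream], memsize,
                    cudaMemcpyDeviceToHost, stream[current_stream]);

    current_stream = next_stream;
}
cudaDeviceSynchronize(); // drain all streams before touching host buffers

Pinned host memory (cudaHostAlloc, as in the hunks above) is what makes the async copies actually overlap with kernel execution.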
@@ -37,15 +37,15 @@
  */

 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>

 // CUDA runtime
 #include <cuda_runtime.h>

 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 #ifndef MAX
 #define MAX(a, b) (a > b ? a : b)

@@ -64,12 +64,14 @@ const int DATA_N = 1048576 * 32;
 // Refer to the 'reduction' CUDA Sample describing
 // reduction optimization strategies
 ////////////////////////////////////////////////////////////////////////////////
-__global__ static void reduceKernel(float *d_Result, float *d_Input, int N) {
+__global__ static void reduceKernel(float *d_Result, float *d_Input, int N)
+{
     const int tid = blockIdx.x * blockDim.x + threadIdx.x;
     const int threadN = gridDim.x * blockDim.x;
     float sum = 0;

-    for (int pos = tid; pos < N; pos += threadN) sum += d_Input[pos];
+    for (int pos = tid; pos < N; pos += threadN)
+        sum += d_Input[pos];

     d_Result[tid] = sum;
 }

@@ -77,7 +79,8 @@ __global__ static void reduceKernel(float *d_Result, float *d_Input, int N) {
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     // Solver config
     TGPUplan plan[MAX_GPU_COUNT];

@@ -129,14 +132,10 @@ int main(int argc, char **argv) {
         checkCudaErrors(cudaSetDevice(i));
         checkCudaErrors(cudaStreamCreate(&plan[i].stream));
         // Allocate memory
-        checkCudaErrors(
-            cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float)));
-        checkCudaErrors(
-            cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float)));
-        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device,
-                                       ACCUM_N * sizeof(float)));
-        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data,
-                                       plan[i].dataN * sizeof(float)));
+        checkCudaErrors(cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float)));
+        checkCudaErrors(cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float)));
+        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device, ACCUM_N * sizeof(float)));
+        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data, plan[i].dataN * sizeof(float)));

         for (j = 0; j < plan[i].dataN; j++) {
             plan[i].h_Data[j] = (float)rand() / (float)RAND_MAX;

@@ -158,19 +157,16 @@ int main(int argc, char **argv) {
         checkCudaErrors(cudaSetDevice(i));

         // Copy input data from CPU
-        checkCudaErrors(cudaMemcpyAsync(plan[i].d_Data, plan[i].h_Data,
-                                        plan[i].dataN * sizeof(float),
-                                        cudaMemcpyHostToDevice, plan[i].stream));
+        checkCudaErrors(cudaMemcpyAsync(
+            plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream));

         // Perform GPU computations
-        reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(
-            plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
+        reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
         getLastCudaError("reduceKernel() execution failed.\n");

         // Read back GPU results
-        checkCudaErrors(cudaMemcpyAsync(plan[i].h_Sum_from_device, plan[i].d_Sum,
-                                        ACCUM_N * sizeof(float),
-                                        cudaMemcpyDeviceToHost, plan[i].stream));
+        checkCudaErrors(cudaMemcpyAsync(
+            plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream));
     }

     // Process GPU results
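
Aside: the dispatch loop reformatted above is the classic multi-GPU fan-out: queue asynchronous work on every device first, then wait. A condensed sketch, assuming `plan[i]` carries each device's stream and buffers as in the hunks:

for (int i = 0; i < GPU_N; i++) {
    checkCudaErrors(cudaSetDevice(i)); // subsequent calls target GPU i
    checkCudaErrors(cudaMemcpyAsync(plan[i].d_Data, plan[i].h_Data,
                                    plan[i].dataN * sizeof(float),
                                    cudaMemcpyHostToDevice, plan[i].stream));
    reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
    checkCudaErrors(cudaMemcpyAsync(plan[i].h_Sum_from_device, plan[i].d_Sum,
                                    ACCUM_N * sizeof(float),
                                    cudaMemcpyDeviceToHost, plan[i].stream));
}

// All GPUs are now working in parallel; wait for each stream to drain.
for (int i = 0; i < GPU_N; i++) {
    checkCudaErrors(cudaSetDevice(i));
    checkCudaErrors(cudaStreamSynchronize(plan[i].stream));
}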
@@ -37,7 +37,8 @@
 #ifndef SIMPLEMULTIGPU_H
 #define SIMPLEMULTIGPU_H

-typedef struct {
+typedef struct
+{
     // Host-side input data
     int dataN;
     float *h_Data;

@@ -56,7 +57,6 @@ typedef struct {

 } TGPUplan;

-extern "C" void launch_reduceKernel(float *d_Result, float *d_Input, int N,
-                                    int BLOCK_N, int THREAD_N, cudaStream_t &s);
+extern "C" void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N, cudaStream_t &s);

 #endif
@@ -25,8 +25,8 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

-#include <iostream>
 #include <helper_cuda.h> // helper functions for CUDA error check
+#include <iostream>

 const int manualBlockSize = 32;

@@ -38,7 +38,8 @@ const int manualBlockSize = 32;
 // execution configuration, including anything the launch configurator
 // API suggests.
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void square(int *array, int arrayCount) {
+__global__ void square(int *array, int arrayCount)
+{
     extern __shared__ int dynamicSmem[];
     int idx = threadIdx.x + blockIdx.x * blockDim.x;

@@ -58,8 +59,8 @@ __global__ void square(int *array, int arrayCount) {
 // This wrapper routine computes the occupancy of kernel, and reports
 // it in terms of active warps / maximum warps per SM.
 ////////////////////////////////////////////////////////////////////////////////
-static double reportPotentialOccupancy(void *kernel, int blockSize,
-                                       size_t dynamicSMem) {
+static double reportPotentialOccupancy(void *kernel, int blockSize, size_t dynamicSMem)
+{
     int device;
     cudaDeviceProp prop;

@@ -72,8 +73,7 @@ static double reportPotentialOccupancy(void *kernel, int blockSize,
     checkCudaErrors(cudaGetDevice(&device));
     checkCudaErrors(cudaGetDeviceProperties(&prop, device));

-    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &numBlocks, kernel, blockSize, dynamicSMem));
+    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, blockSize, dynamicSMem));

     activeWarps = numBlocks * blockSize / prop.warpSize;
     maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;

@@ -99,7 +99,8 @@ static double reportPotentialOccupancy(void *kernel, int blockSize,
 // This function configures the launch based on the "automatic"
 // argument, records the runtime, and reports occupancy and runtime.
 ////////////////////////////////////////////////////////////////////////////////
-static int launchConfig(int *array, int arrayCount, bool automatic) {
+static int launchConfig(int *array, int arrayCount, bool automatic)
+{
     int blockSize;
     int minGridSize;
     int gridSize;

@@ -116,14 +117,13 @@ static int launchConfig(int *array, int arrayCount, bool automatic) {
     checkCudaErrors(cudaEventCreate(&end));

     if (automatic) {
-        checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
-            &minGridSize, &blockSize, (void *)square, dynamicSMemUsage,
-            arrayCount));
+        checkCudaErrors(
+            cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)square, dynamicSMemUsage, arrayCount));

         std::cout << "Suggested block size: " << blockSize << std::endl
-                  << "Minimum grid size for maximum occupancy: " << minGridSize
-                  << std::endl;
-    } else {
+                  << "Minimum grid size for maximum occupancy: " << minGridSize << std::endl;
+    }
+    else {
         // This block size is too small. Given limited number of
         // active blocks per multiprocessor, the number of active
         // threads will be limited, and thus unable to achieve maximum

@@ -146,11 +146,9 @@ static int launchConfig(int *array, int arrayCount, bool automatic) {

     // Calculate occupancy
     //
-    potentialOccupancy =
-        reportPotentialOccupancy((void *)square, blockSize, dynamicSMemUsage);
+    potentialOccupancy = reportPotentialOccupancy((void *)square, blockSize, dynamicSMemUsage);

-    std::cout << "Potential occupancy: " << potentialOccupancy * 100 << "%"
-              << std::endl;
+    std::cout << "Potential occupancy: " << potentialOccupancy * 100 << "%" << std::endl;

     // Report elapsed time
     //

@@ -166,7 +164,8 @@ static int launchConfig(int *array, int arrayCount, bool automatic) {
 // The test generates an array and squares it with a CUDA kernel, then
 // verifies the result.
 ////////////////////////////////////////////////////////////////////////////////
-static int test(bool automaticLaunchConfig, const int count = 1000000) {
+static int test(bool automaticLaunchConfig, const int count = 1000000)
+{
     int *array;
     int *dArray;
     int size = count * sizeof(int);

@@ -193,8 +192,7 @@ static int test(bool automaticLaunchConfig, const int count = 1000000) {
     //
     for (int i = 0; i < count; i += 1) {
         if (array[i] != i * i) {
-            std::cout << "element " << i << " expected " << i * i << " actual "
-                      << array[i] << std::endl;
+            std::cout << "element " << i << " expected " << i * i << " actual " << array[i] << std::endl;
             return 1;
         }
     }

@@ -210,13 +208,13 @@ static int test(bool automaticLaunchConfig, const int count = 1000000) {
 // automatically configured launch, and reports the occupancy and
 // performance.
 ////////////////////////////////////////////////////////////////////////////////
-int main() {
+int main()
+{
     int status;

     std::cout << "starting Simple Occupancy" << std::endl << std::endl;

-    std::cout << "[ Manual configuration with " << manualBlockSize
-              << " threads per block ]" << std::endl;
+    std::cout << "[ Manual configuration with " << manualBlockSize << " threads per block ]" << std::endl;

     status = test(false);
     if (status) {
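
Aside: the two occupancy calls reformatted above pair naturally: ask the configurator for a block size, then launch with it. A minimal self-contained sketch with a hypothetical kernel (`scale` is illustrative, not the sample's):

__global__ void scale(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= 2.0f;
}

void launchAtSuggestedSize(float *dData, int n)
{
    int minGridSize = 0, blockSize = 0;
    // Ask the configurator for the block size that maximizes occupancy
    // (0 bytes of dynamic shared memory for this kernel).
    checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)scale, 0, n));

    int gridSize = (n + blockSize - 1) / blockSize; // round up to cover every element
    scale<<<gridSize, blockSize>>>(dData, n);
    checkCudaErrors(cudaGetLastError());
}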
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
-
@@ -31,8 +31,8 @@
  */

 // includes, system
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>

 // CUDA includes
 #include <cuda_runtime.h>

@@ -41,7 +41,8 @@
 #include <helper_cuda.h>
 #include <helper_functions.h> // helper for shared that are common to CUDA Samples

-__global__ void SimpleKernel(float *src, float *dst) {
+__global__ void SimpleKernel(float *src, float *dst)
+{
     // Just a dummy kernel, doing enough for us to verify that everything
     // worked
     const int idx = blockIdx.x * blockDim.x + threadIdx.x;

@@ -50,12 +51,12 @@ __global__ void SimpleKernel(float *src, float *dst) {

 inline bool IsAppBuiltAs64() { return sizeof(void *) == 8; }

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("[%s] - Starting...\n", argv[0]);

     if (!IsAppBuiltAs64()) {
-        printf(
-            "%s is only supported with on 64-bit OSs and the application must be "
-            "built as a 64-bit target. Test is being waived.\n",
-            argv[0]);
+        printf("%s is only supported with on 64-bit OSs and the application must be "
+               "built as a 64-bit target. Test is being waived.\n",
+               argv[0]);
         exit(EXIT_WAIVED);

@@ -68,8 +69,7 @@ int main(int argc, char **argv) {
     printf("CUDA-capable device count: %i\n", gpu_n);

     if (gpu_n < 2) {
-        printf(
-            "Two or more GPUs with Peer-to-Peer access capability are required for "
-            "%s.\n",
-            argv[0]);
+        printf("Two or more GPUs with Peer-to-Peer access capability are required for "
+               "%s.\n",
+               argv[0]);
         printf("Waiving test.\n");

@@ -97,8 +97,12 @@ int main(int argc, char **argv) {
                 continue;
             }
             checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, i, j));
-            printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[i].name,
-                   i, prop[j].name, j, can_access_peer ? "Yes" : "No");
+            printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
+                   prop[i].name,
+                   i,
+                   prop[j].name,
+                   j,
+                   can_access_peer ? "Yes" : "No");
             if (can_access_peer && p2pCapableGPUs[0] == -1) {
                 p2pCapableGPUs[0] = i;
                 p2pCapableGPUs[1] = j;

@@ -107,12 +111,10 @@ int main(int argc, char **argv) {
     }

     if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) {
-        printf(
-            "Two or more GPUs with Peer-to-Peer access capability are required for "
-            "%s.\n",
-            argv[0]);
-        printf(
-            "Peer to Peer access is not available amongst GPUs in the system, "
-            "waiving test.\n");
+        printf("Two or more GPUs with Peer-to-Peer access capability are required for "
+               "%s.\n",
+               argv[0]);
+        printf("Peer to Peer access is not available amongst GPUs in the system, "
+               "waiving test.\n");

         exit(EXIT_WAIVED);

@@ -123,8 +125,7 @@ int main(int argc, char **argv) {
     gpuid[1] = p2pCapableGPUs[1];

     // Enable peer access
-    printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0],
-           gpuid[1]);
+    printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0], gpuid[1]);
     checkCudaErrors(cudaSetDevice(gpuid[0]));
     checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0));
     checkCudaErrors(cudaSetDevice(gpuid[1]));

@@ -132,8 +133,8 @@ int main(int argc, char **argv) {

     // Allocate buffers
     const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
-    printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n",
-           int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]);
+    printf(
+        "Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]);
     checkCudaErrors(cudaSetDevice(gpuid[0]));
     float *g0;
     checkCudaErrors(cudaMalloc(&g0, buf_size));

@@ -141,8 +142,7 @@ int main(int argc, char **argv) {
     float *g1;
     checkCudaErrors(cudaMalloc(&g1, buf_size));
     float *h0;
-    checkCudaErrors(
-        cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA
+    checkCudaErrors(cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA

     // Create CUDA event handles
     printf("Creating event handles...\n");

@@ -161,7 +161,8 @@ int main(int argc, char **argv) {
         // Ping-pong copy between GPUs
         if (i % 2 == 0) {
             checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
-        } else {
+        }
+        else {
             checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
         }
     }

@@ -170,9 +171,9 @@ int main(int argc, char **argv) {
     checkCudaErrors(cudaEventSynchronize(stop_event));
     checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
     printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n",
-           gpuid[0], gpuid[1],
-           (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f /
-               1024.0f / 1024.0f);
+           gpuid[0],
+           gpuid[1],
+           (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);

     // Prepare host buffer and copy to GPU 0
     printf("Preparing host buffer and memcpy to GPU%d...\n", gpuid[0]);

@@ -190,10 +191,11 @@ int main(int argc, char **argv) {

     // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
     // output to the GPU 1 buffer
-    printf(
-        "Run kernel on GPU%d, taking source data from GPU%d and writing to "
-        "GPU%d...\n",
-        gpuid[1], gpuid[0], gpuid[1]);
+    printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
+           "GPU%d...\n",
+           gpuid[1],
+           gpuid[0],
+           gpuid[1]);
     checkCudaErrors(cudaSetDevice(gpuid[1]));
     SimpleKernel<<<blocks, threads>>>(g0, g1);

@@ -201,10 +203,11 @@ int main(int argc, char **argv) {

     // Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
     // output to the GPU 0 buffer
-    printf(
-        "Run kernel on GPU%d, taking source data from GPU%d and writing to "
-        "GPU%d...\n",
-        gpuid[0], gpuid[1], gpuid[0]);
+    printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
+           "GPU%d...\n",
+           gpuid[0],
+           gpuid[1],
+           gpuid[0]);
     checkCudaErrors(cudaSetDevice(gpuid[0]));
     SimpleKernel<<<blocks, threads>>>(g1, g0);

@@ -220,8 +223,7 @@ int main(int argc, char **argv) {
         // Re-generate input data and apply 2x '* 2.0f' computation of both
         // kernel runs
         if (h0[i] != float(i % 4096) * 2.0f * 2.0f) {
-            printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i],
-                   (float(i % 4096) * 2.0f * 2.0f));
+            printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i], (float(i % 4096) * 2.0f * 2.0f));

             if (error_count++ > 10) {
                 break;

@@ -253,7 +255,8 @@ int main(int argc, char **argv) {
     if (error_count != 0) {
         printf("Test failed!\n");
         exit(EXIT_FAILURE);
-    } else {
+    }
+    else {
         printf("Test passed\n");
         exit(EXIT_SUCCESS);
     }
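
Aside: once two capable devices are found, the core of the P2P path above is just a capability check, a mutual opt-in, and a UVA copy. A minimal sketch reusing the sample's buffers (device IDs are illustrative):

int gpu0 = 0, gpu1 = 1, can01 = 0, can10 = 0;
checkCudaErrors(cudaDeviceCanAccessPeer(&can01, gpu0, gpu1));
checkCudaErrors(cudaDeviceCanAccessPeer(&can10, gpu1, gpu0));

if (can01 && can10) {
    // Each device must enable access from its own context.
    checkCudaErrors(cudaSetDevice(gpu0));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpu1, 0));
    checkCudaErrors(cudaSetDevice(gpu1));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpu0, 0));

    // With UVA, cudaMemcpyDefault lets the runtime route the copy GPU-to-GPU.
    checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
}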
@@ -70,29 +70,26 @@ bool bTestResult = true;
 //! Shifts matrix elements using pitch linear array
 //! @param odata output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void shiftPitchLinear(float *odata, int pitch, int width, int height,
-                                 int shiftX, int shiftY,
-                                 cudaTextureObject_t texRefPL) {
+__global__ void
+shiftPitchLinear(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefPL)
+{
     int xid = blockIdx.x * blockDim.x + threadIdx.x;
     int yid = blockIdx.y * blockDim.y + threadIdx.y;

-    odata[yid * pitch + xid] = tex2D<float>(
-        texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
+    odata[yid * pitch + xid] = tex2D<float>(texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //! Shifts matrix elements using regular array
 //! @param odata output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void shiftArray(float *odata, int pitch, int width, int height,
-                           int shiftX, int shiftY,
-                           cudaTextureObject_t texRefArray) {
+__global__ void
+shiftArray(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefArray)
+{
     int xid = blockIdx.x * blockDim.x + threadIdx.x;
     int yid = blockIdx.y * blockDim.y + threadIdx.y;

-    odata[yid * pitch + xid] =
-        tex2D<float>(texRefArray, (xid + shiftX) / (float)width,
-                     (yid + shiftY) / (float)height);
+    odata[yid * pitch + xid] = tex2D<float>(texRefArray, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
 }
@@ -102,20 +99,21 @@ void runTest(int argc, char **argv);
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("%s starting...\n\n", sSDKsample);

     runTest(argc, argv);

-    printf("%s completed, returned %s\n", sSDKsample,
-           bTestResult ? "OK" : "ERROR!");
+    printf("%s completed, returned %s\n", sSDKsample, bTestResult ? "OK" : "ERROR!");
     exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////
-void runTest(int argc, char **argv) {
+void runTest(int argc, char **argv)
+{
     // Set array size
     const int nx = 2048;
     const int ny = 2048;
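Both kernels sample through cudaTextureObject_t handles with normalized coordinates (hence the divisions by width and height) and rely on wrap addressing so that the shift wraps around the matrix edges. For orientation, a sketch of how a texture object over the pitch-linear input would be configured; this is a reconstruction for illustration, not the sample's literal setup code, and the filter mode is an assumption:

    cudaResourceDesc texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));
    texRes.resType                  = cudaResourceTypePitch2D;
    texRes.res.pitch2D.devPtr       = d_idataPL;
    texRes.res.pitch2D.desc         = cudaCreateChannelDesc<float>();
    texRes.res.pitch2D.width        = nx;
    texRes.res.pitch2D.height       = ny;
    texRes.res.pitch2D.pitchInBytes = d_pitchBytes;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));
    texDescr.normalizedCoords = true;                // kernels divide by width/height
    texDescr.filterMode       = cudaFilterModePoint; // assumption; not visible in this diff
    texDescr.addressMode[0]   = cudaAddressModeWrap; // lets the shift wrap around
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    cudaTextureObject_t texRefPL;
    checkCudaErrors(cudaCreateTextureObject(&texRefPL, &texRes, &texDescr, NULL));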
@@ -154,8 +152,7 @@ void runTest(int argc, char **argv) {
     float *d_idataPL;
     size_t d_pitchBytes;

-    checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes,
-                                    nx * sizeof(float), ny));
+    checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes, nx * sizeof(float), ny));

     // Array input data
     cudaArray *d_idataArray;
@@ -165,20 +162,17 @@ void runTest(int argc, char **argv) {

     // Pitch linear output data
     float *d_odata;
-    checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_pitchBytes,
-                                    nx * sizeof(float), ny));
+    checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_pitchBytes, nx * sizeof(float), ny));

     // Copy host data to device
     // Pitch linear
     size_t h_pitchBytes = nx * sizeof(float);

-    checkCudaErrors(cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes,
-                                 nx * sizeof(float), ny, cudaMemcpyHostToDevice));
+    checkCudaErrors(
+        cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes, nx * sizeof(float), ny, cudaMemcpyHostToDevice));

     // Array
-    checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata,
-                                      nx * ny * sizeof(float),
-                                      cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata, nx * ny * sizeof(float), cudaMemcpyHostToDevice));

     cudaTextureObject_t texRefPL;
     cudaTextureObject_t texRefArray;
@@ -210,8 +204,7 @@ void runTest(int argc, char **argv) {
     texDescr.addressMode[0] = cudaAddressModeWrap;
     texDescr.addressMode[1] = cudaAddressModeWrap;
     texDescr.readMode = cudaReadModeElementType;
-    checkCudaErrors(
-        cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL));
+    checkCudaErrors(cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL));

     // Reference calculation
     for (int j = 0; j < ny; ++j) {
@@ -224,15 +217,13 @@ void runTest(int argc, char **argv) {
     }

     // Run ShiftPitchLinear kernel
-    checkCudaErrors(
-        cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
+    checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));

     checkCudaErrors(cudaEventRecord(start, 0));

     for (int i = 0; i < NUM_REPS; ++i) {
-        shiftPitchLinear<<<dimGrid, dimBlock>>>(d_odata,
-                                                (int)(d_pitchBytes / sizeof(float)),
-                                                nx, ny, x_shift, y_shift, texRefPL);
+        shiftPitchLinear<<<dimGrid, dimBlock>>>(
+            d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefPL);
     }

     checkCudaErrors(cudaEventRecord(stop, 0));
@@ -241,8 +232,8 @@ void runTest(int argc, char **argv) {
     checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop));

     // Check results
-    checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes,
-                                 nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
+    checkCudaErrors(
+        cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost));

     bool res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);

@@ -254,14 +245,12 @@ void runTest(int argc, char **argv) {
     }

     // Run ShiftArray kernel
-    checkCudaErrors(
-        cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
+    checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
     checkCudaErrors(cudaEventRecord(start, 0));

     for (int i = 0; i < NUM_REPS; ++i) {
-        shiftArray<<<dimGrid, dimBlock>>>(d_odata,
-                                          (int)(d_pitchBytes / sizeof(float)), nx,
-                                          ny, x_shift, y_shift, texRefArray);
+        shiftArray<<<dimGrid, dimBlock>>>(
+            d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefArray);
     }

     checkCudaErrors(cudaEventRecord(stop, 0));
@@ -270,8 +259,8 @@ void runTest(int argc, char **argv) {
     checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop));

     // Check results
-    checkCudaErrors(cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes,
-                                 nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
+    checkCudaErrors(
+        cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
     res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);

     if (res == false) {
@@ -279,21 +268,18 @@ void runTest(int argc, char **argv) {
         bTestResult = false;
     }

-    float bandwidthPL =
-        2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS);
-    float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) /
-                           (timeArray / NUM_REPS);
+    float bandwidthPL = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS);
+    float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timeArray / NUM_REPS);

-    printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n",
-           bandwidthPL, bandwidthArray);
+    printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n", bandwidthPL, bandwidthArray);

     float fetchRatePL = nx * ny / 1.e+6f / (timePL / (1000.0f * NUM_REPS));
     float fetchRateArray = nx * ny / 1.e+6f / (timeArray / (1000.0f * NUM_REPS));

-    printf(
-        "\nTexture fetch rate (Mpix/s) for pitch linear: "
+    printf("\nTexture fetch rate (Mpix/s) for pitch linear: "
            "%.2e; for array: %.2e\n\n",
-        fetchRatePL, fetchRateArray);
+           fetchRatePL,
+           fetchRateArray);

     // Cleanup
     free(h_idata);
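A note on the bandwidth expression reformatted in the last hunk: each of the nx * ny pixels is read once through the texture and written once to d_odata (the factor 2), sizeof(float) converts pixels to bytes, timePL / NUM_REPS is milliseconds per repetition, and the 1000/1e9 pair converts milliseconds to seconds and bytes to gigabytes. The same arithmetic, unpacked with the sample's own names:

    float msPerRep   = timePL / NUM_REPS;              // cudaEventElapsedTime reports milliseconds
    float bytesMoved = 2.0f * nx * ny * sizeof(float); // one texture read + one global write per pixel
    float gbPerSec   = bytesMoved / 1.0e9f / (msPerRep / 1000.0f);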
@@ -26,28 +26,30 @@
  */

 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>

 // CUDA runtime
 #include <cuda_runtime.h>

 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 #ifndef MAX
 #define MAX(a, b) (a > b ? a : b)
 #endif

-__global__ void testKernel(int val) {
-    printf("[%d, %d]:\t\tValue is:%d\n", blockIdx.y * gridDim.x + blockIdx.x,
-           threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x +
-               threadIdx.x,
+__global__ void testKernel(int val)
+{
+    printf("[%d, %d]:\t\tValue is:%d\n",
+           blockIdx.y * gridDim.x + blockIdx.x,
+           threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x,
            val);
 }

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     int devID;
     cudaDeviceProp props;

@@ -57,8 +59,7 @@ int main(int argc, char **argv) {
     // Get GPU information
     checkCudaErrors(cudaGetDevice(&devID));
     checkCudaErrors(cudaGetDeviceProperties(&props, devID));
-    printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name,
-           props.major, props.minor);
+    printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name, props.major, props.minor);

     printf("printf() is called. Output:\n\n");

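One point worth keeping in mind when reading the simplePrintf hunks: device-side printf writes into a buffer on the GPU that is flushed to the host only at synchronization points, so the kernel's output appears only after a call such as cudaDeviceSynchronize(). A minimal sketch (the launch configuration here is illustrative):

    testKernel<<<dimGrid, dimBlock>>>(10);
    checkCudaErrors(cudaDeviceSynchronize()); // flushes the device printf buffer to stdout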
@@ -48,24 +48,25 @@

 const char *sSDKsample = "simpleStreams";

-const char *sEventSyncMethod[] = {"cudaEventDefault", "cudaEventBlockingSync",
-                                  "cudaEventDisableTiming", NULL};
+const char *sEventSyncMethod[] = {"cudaEventDefault", "cudaEventBlockingSync", "cudaEventDisableTiming", NULL};

-const char *sDeviceSyncMethod[] = {
-    "cudaDeviceScheduleAuto", "cudaDeviceScheduleSpin",
-    "cudaDeviceScheduleYield", "INVALID",
-    "cudaDeviceScheduleBlockingSync", NULL};
+const char *sDeviceSyncMethod[] = {"cudaDeviceScheduleAuto",
+                                   "cudaDeviceScheduleSpin",
+                                   "cudaDeviceScheduleYield",
+                                   "INVALID",
+                                   "cudaDeviceScheduleBlockingSync",
+                                   NULL};

 // System includes
-#include <stdio.h>
 #include <assert.h>
+#include <stdio.h>

 // CUDA runtime
 #include <cuda_runtime.h>

 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 #ifndef WIN32
 #include <sys/mman.h> // for mmap() / munmap()
@@ -75,7 +76,8 @@ const char *sDeviceSyncMethod[] = {
 #define MEMORY_ALIGNMENT 4096
 #define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))

-__global__ void init_array(int *g_data, int *factor, int num_iterations) {
+__global__ void init_array(int *g_data, int *factor, int num_iterations)
+{
     int idx = blockIdx.x * blockDim.x + threadIdx.x;

     for (int i = 0; i < num_iterations; i++) {
@@ -83,7 +85,8 @@ __global__ void init_array(int *g_data, int *factor, int num_iterations) {
     }
 }

-bool correct_data(int *a, const int n, const int c) {
+bool correct_data(int *a, const int n, const int c)
+{
     for (int i = 0; i < n; i++) {
         if (a[i] != c) {
             printf("%d: %d %d\n", i, a[i], c);
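The ALIGN_UP macro above rounds an address up to the next size-byte boundary (size must be a power of two) by adding size - 1 and then masking off the low bits. A worked example with a hypothetical address:

    // ALIGN_UP(0x1001, 4096): 0x1001 + 0xFFF = 0x2000, and 0x2000 & ~0xFFF = 0x2000.
    // An already-aligned address is unchanged: ALIGN_UP(0x2000, 4096) == 0x2000.
    int *pAligned = (int *)ALIGN_UP(pRaw, MEMORY_ALIGNMENT); // pRaw is a hypothetical raw pointer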
@@ -94,51 +97,45 @@ bool correct_data(int *a, const int n, const int c) {
     return true;
 }

-inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a,
-                               int **ppAligned_a, int nbytes) {
+inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
+{
 #if CUDART_VERSION >= 4000
 #if !defined(__arm__) && !defined(__aarch64__)
     if (bPinGenericMemory) {
         // allocate a generic page-aligned chunk of system memory
 #ifdef WIN32
-        printf(
-            "> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned "
+        printf("> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned "
                "system memory)\n",
                (float)nbytes / 1048576.0f);
-        *pp_a = (int *)VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT),
-                                    MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+        *pp_a = (int *)VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
 #else
-        printf(
-            "> mmap() allocating %4.2f Mbytes (generic page-aligned system "
+        printf("> mmap() allocating %4.2f Mbytes (generic page-aligned system "
                "memory)\n",
                (float)nbytes / 1048576.0f);
-        *pp_a = (int *)mmap(NULL, (nbytes + MEMORY_ALIGNMENT),
-                            PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
+        *pp_a = (int *)mmap(NULL, (nbytes + MEMORY_ALIGNMENT), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
 #endif

         *ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);

-        printf(
-            "> cudaHostRegister() registering %4.2f Mbytes of generic allocated "
+        printf("> cudaHostRegister() registering %4.2f Mbytes of generic allocated "
                "system memory\n",
                (float)nbytes / 1048576.0f);
         // pin allocate memory
-        checkCudaErrors(
-            cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped));
-    } else
+        checkCudaErrors(cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped));
+    }
+    else
 #endif
 #endif
     {
-        printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n",
-               (float)nbytes / 1048576.0f);
+        printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n", (float)nbytes / 1048576.0f);
         // allocate host memory (pinned is required for achieve asynchronicity)
         checkCudaErrors(cudaMallocHost((void **)pp_a, nbytes));
         *ppAligned_a = *pp_a;
     }
 }

-inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a,
-                           int **ppAligned_a, int nbytes) {
+inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
+{
 #if CUDART_VERSION >= 4000
 #if !defined(__arm__) && !defined(__aarch64__)
     // CUDA 4.0 support pinning of generic host memory
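Because AllocateHostMemory registers the generic allocation with cudaHostRegisterMapped, the pinned region can also be mapped into the device address space. That step is not part of the hunk above, but a hedged sketch of how a device pointer would be obtained for such memory:

    // Requires cudaSetDeviceFlags(... | cudaDeviceMapHost) before context creation,
    // and a prior cudaHostRegister(..., cudaHostRegisterMapped) on the host pointer.
    int *d_aligned = NULL;
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_aligned, (void *)*ppAligned_a, 0));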
@@ -150,7 +147,8 @@ inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a,
 #else
         munmap(*pp_a, nbytes);
 #endif
-    } else
+    }
+    else
 #endif
 #endif
     {
@@ -158,26 +156,24 @@ inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a,
     }
 }

-static const char *sSyncMethod[] = {
-    "0 (Automatic Blocking)",
+static const char *sSyncMethod[] = {"0 (Automatic Blocking)",
                                     "1 (Spin Blocking)",
                                     "2 (Yield Blocking)",
                                     "3 (Undefined Blocking Method)",
                                     "4 (Blocking Sync Event) = low CPU utilization",
                                     NULL};

-void printHelp() {
+void printHelp()
+{
     printf("Usage: %s [options below]\n", sSDKsample);
     printf("\t--sync_method=n for CPU/GPU synchronization\n");
     printf("\t n=%s\n", sSyncMethod[0]);
     printf("\t n=%s\n", sSyncMethod[1]);
     printf("\t n=%s\n", sSyncMethod[2]);
     printf("\t <Default> n=%s\n", sSyncMethod[4]);
-    printf(
-        "\t--use_generic_memory (default) use generic page-aligned for system "
+    printf("\t--use_generic_memory (default) use generic page-aligned for system "
            "memory\n");
-    printf(
-        "\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate "
+    printf("\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate "
            "system memory\n");
 }

@@ -187,7 +183,8 @@ void printHelp() {
 #define DEFAULT_PINNED_GENERIC_MEMORY true
 #endif

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     int cuda_device = 0;
     int nstreams = 4; // number of streams for CUDA calls
     int nreps = 10;   // number of times each experiment is repeated
@@ -199,10 +196,8 @@ int main(int argc, char **argv) {

     // allocate generic memory and pin it laster instead of using cudaHostAlloc()

-    bool bPinGenericMemory =
-        DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior
-    int device_sync_method =
-        cudaDeviceBlockingSync; // by default we use BlockingSync
+    bool bPinGenericMemory = DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior
+    int device_sync_method = cudaDeviceBlockingSync;        // by default we use BlockingSync

     int niterations; // number of iterations for the loop inside the kernel

@@ -213,20 +208,18 @@ int main(int argc, char **argv) {
         return EXIT_SUCCESS;
     }

-    if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv,
-                                                    "sync_method")) >= 0) {
-        if (device_sync_method == 0 || device_sync_method == 1 ||
-            device_sync_method == 2 || device_sync_method == 4) {
-            printf("Device synchronization method set to = %s\n",
-                   sSyncMethod[device_sync_method]);
+    if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv, "sync_method")) >= 0) {
+        if (device_sync_method == 0 || device_sync_method == 1 || device_sync_method == 2 || device_sync_method == 4) {
+            printf("Device synchronization method set to = %s\n", sSyncMethod[device_sync_method]);
             printf("Setting reps to 100 to demonstrate steady state\n");
             nreps = 100;
-        } else {
-            printf("Invalid command line option sync_method=\"%d\"\n",
-                   device_sync_method);
+        }
+        else {
+            printf("Invalid command line option sync_method=\"%d\"\n", device_sync_method);
             return EXIT_FAILURE;
         }
-    } else {
+    }
+    else {
         printHelp();
         return EXIT_SUCCESS;
     }
@@ -252,16 +245,13 @@ int main(int argc, char **argv) {
     checkCudaErrors(cudaGetDeviceCount(&num_devices));

     if (0 == num_devices) {
-        printf(
-            "your system does not have a CUDA capable device, waiving test...\n");
+        printf("your system does not have a CUDA capable device, waiving test...\n");
         return EXIT_WAIVED;
     }

     // check if the command-line chosen device ID is within range, exit if not
     if (cuda_device >= num_devices) {
-        printf(
-            "cuda_device=%d is invalid, must choose device ID between 0 and %d\n",
-            cuda_device, num_devices - 1);
+        printf("cuda_device=%d is invalid, must choose device ID between 0 and %d\n", cuda_device, num_devices - 1);
         return EXIT_FAILURE;
     }

@@ -276,12 +266,10 @@ int main(int argc, char **argv) {
     // Check if GPU can map host memory (Generic Method), if not then we override
     // bPinGenericMemory to be false
     if (bPinGenericMemory) {
-        printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name,
-               deviceProp.canMapHostMemory ? "Yes" : "No");
+        printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name, deviceProp.canMapHostMemory ? "Yes" : "No");

         if (deviceProp.canMapHostMemory == 0) {
-            printf(
-                "Using cudaMallocHost, CUDA device does not support mapping of "
+            printf("Using cudaMallocHost, CUDA device does not support mapping of "
                    "generic host memory\n");
             bPinGenericMemory = false;
         }
@@ -289,27 +277,22 @@ int main(int argc, char **argv) {

     // Anything that is less than 32 Cores will have scaled down workload
     scale_factor =
-        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
-                      (float)deviceProp.multiProcessorCount)),
+        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
             1.0f);
     n = (int)rint((float)n / scale_factor);

-    printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major,
-           deviceProp.minor);
+    printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor);
     printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n",
            deviceProp.multiProcessorCount,
            _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
-           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
-               deviceProp.multiProcessorCount);
+           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

     printf("> scale_factor = %1.4f\n", 1.0f / scale_factor);
     printf("> array_size = %d\n\n", n);

     // enable use of blocking sync, to reduce CPU usage
-    printf("> Using CPU/GPU Device Synchronization method (%s)\n",
-           sDeviceSyncMethod[device_sync_method]);
-    checkCudaErrors(cudaSetDeviceFlags(
-        device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0)));
+    printf("> Using CPU/GPU Device Synchronization method (%s)\n", sDeviceSyncMethod[device_sync_method]);
+    checkCudaErrors(cudaSetDeviceFlags(device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0)));

     // allocate host memory
     int c = 5; // value to which the array will be initialized
@@ -332,8 +315,7 @@ int main(int argc, char **argv) {
     printf("\nStarting Test\n");

     // allocate and initialize an array of stream handles
-    cudaStream_t *streams =
-        (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
+    cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

     for (int i = 0; i < nstreams; i++) {
         checkCudaErrors(cudaStreamCreate(&(streams[i])));
@@ -342,9 +324,7 @@ int main(int argc, char **argv) {
     // create CUDA event handles
     // use blocking sync
     cudaEvent_t start_event, stop_event;
-    int eventflags =
-        ((device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync
-                                                        : cudaEventDefault);
+    int eventflags = ((device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync : cudaEventDefault);

     checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
     checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));
@@ -354,11 +334,9 @@ int main(int argc, char **argv) {
     // ensure that all previous
     // CUDA calls have
     // completed
-    checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes,
-                                    cudaMemcpyDeviceToHost, streams[0]));
+    checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost, streams[0]));
     checkCudaErrors(cudaEventRecord(stop_event, 0));
-    checkCudaErrors(cudaEventSynchronize(
-        stop_event)); // block until the event is actually recorded
+    checkCudaErrors(cudaEventSynchronize(stop_event)); // block until the event is actually recorded
     checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
     printf("memcopy:\t%.2f\n", time_memcpy);

@@ -380,8 +358,7 @@ int main(int argc, char **argv) {

     for (int k = 0; k < nreps; k++) {
         init_array<<<blocks, threads>>>(d_a, d_c, niterations);
-        checkCudaErrors(
-            cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost));
+        checkCudaErrors(cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost));
     }

     checkCudaErrors(cudaEventRecord(stop_event, 0));
@@ -395,16 +372,14 @@ int main(int argc, char **argv) {
     blocks = dim3(n / (nstreams * threads.x), 1);
     memset(hAligned_a, 255,
            nbytes); // set host memory bits to all 1s, for testing correctness
-    checkCudaErrors(cudaMemset(
-        d_a, 0, nbytes)); // set device memory to all 0s, for testing correctness
+    checkCudaErrors(cudaMemset(d_a, 0, nbytes)); // set device memory to all 0s, for testing correctness
     checkCudaErrors(cudaEventRecord(start_event, 0));

     for (int k = 0; k < nreps; k++) {
         // asynchronously launch nstreams kernels, each operating on its own portion
         // of data
         for (int i = 0; i < nstreams; i++) {
-            init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams,
-                                                           d_c, niterations);
+            init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_c, niterations);
         }

         // asynchronously launch nstreams memcopies. Note that memcopy in stream x
@@ -413,8 +388,10 @@ int main(int argc, char **argv) {
         // completed
         for (int i = 0; i < nstreams; i++) {
             checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams,
-                                            d_a + i * n / nstreams, nbytes / nstreams,
-                                            cudaMemcpyDeviceToHost, streams[i]));
+                                            d_a + i * n / nstreams,
+                                            nbytes / nstreams,
+                                            cudaMemcpyDeviceToHost,
+                                            streams[i]));
         }
     }

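The loops reformatted above are the heart of simpleStreams: calls issued to different streams may overlap, while calls within one stream execute in issue order, so chunk i's download can begin only after chunk i's kernel finishes. Condensed to a per-stream skeleton (the sample itself issues all kernels before all copies; names follow the sample):

    for (int i = 0; i < nstreams; i++) {
        init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_c, niterations);
        checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams,
                                        d_a + i * n / nstreams,
                                        nbytes / nstreams,
                                        cudaMemcpyDeviceToHost,
                                        streams[i]));
    }
    checkCudaErrors(cudaDeviceSynchronize()); // wait for all streams to drain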
@@ -34,10 +34,10 @@
  */

 // Includes, system
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 #ifdef _WIN32
 #define WINDOWS_LEAN_AND_MEAN
@@ -73,23 +73,22 @@ static const char *sampleName = "simpleSurfaceWrite";
 //! Write to a cuArray (texture data source) using surface writes
 //! @param gIData input data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void surfaceWriteKernel(float *gIData, int width, int height,
-                                   cudaSurfaceObject_t outputSurface) {
+__global__ void surfaceWriteKernel(float *gIData, int width, int height, cudaSurfaceObject_t outputSurface)
+{
     // calculate surface coordinates
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

     // read from global memory and write to cuarray (via surface reference)
-    surf2Dwrite(gIData[y * width + x], outputSurface, x * 4, y,
-                cudaBoundaryModeTrap);
+    surf2Dwrite(gIData[y * width + x], outputSurface, x * 4, y, cudaBoundaryModeTrap);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //! Transform an image using texture lookups
 //! @param gOData output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void transformKernel(float *gOData, int width, int height,
-                                float theta, cudaTextureObject_t tex) {
+__global__ void transformKernel(float *gOData, int width, int height, float theta, cudaTextureObject_t tex)
+{
     // calculate normalized texture coordinates
     unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
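In surfaceWriteKernel, surf2Dwrite addresses the surface in bytes along x, which is why a float store uses x * 4. The outputSurface handle is created over a cudaArray allocated with cudaArraySurfaceLoadStore; a sketch matching the calls visible later in this diff:

    cudaResourceDesc surfRes;
    memset(&surfRes, 0, sizeof(cudaResourceDesc));
    surfRes.resType         = cudaResourceTypeArray;
    surfRes.res.array.array = cuArray; // must have been allocated with cudaArraySurfaceLoadStore

    cudaSurfaceObject_t outputSurface;
    checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));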
@@ -111,29 +110,29 @@ __global__ void transformKernel(float *gOData, int width, int height,
 // Declaration, forward
 void runTest(int argc, char **argv);

-extern "C" void computeGold(float *reference, float *idata,
-                            const unsigned int len);
+extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("%s starting...\n", sampleName);

     // Process command-line arguments
     if (argc > 1) {
         if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
-            getCmdLineArgumentString(argc, (const char **)argv, "input",
-                                     (char **)&imageFilename);
+            getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename);

             if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
-                getCmdLineArgumentString(argc, (const char **)argv, "reference",
-                                         (char **)&refFilename);
-            } else {
+                getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename);
+            }
+            else {
                 printf("-input flag should be used with -reference flag");
                 exit(EXIT_FAILURE);
             }
-        } else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
+        }
+        else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
             printf("-reference flag should be used with -input flag");
             exit(EXIT_FAILURE);
         }
@@ -141,15 +140,15 @@ int main(int argc, char **argv) {

     runTest(argc, argv);

-    printf("%s completed, returned %s\n", sampleName,
-           testResult ? "OK" : "ERROR!");
+    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
     exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
 }

 ////////////////////////////////////////////////////////////////////////////////
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////
-void runTest(int argc, char **argv) {
+void runTest(int argc, char **argv)
+{
     // Use command-line specified CUDA device,
     // otherwise use device with highest Gflops/s
     int devID = findCudaDevice(argc, (const char **)argv);
@@ -159,7 +158,9 @@ void runTest(int argc, char **argv) {

     checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
     printf("CUDA device [%s] has %d Multi-Processors, SM %d.%d\n",
-           deviceProps.name, deviceProps.multiProcessorCount, deviceProps.major,
+           deviceProps.name,
+           deviceProps.multiProcessorCount,
+           deviceProps.major,
            deviceProps.minor);

     // Load image from disk
@@ -193,11 +194,9 @@ void runTest(int argc, char **argv) {
     checkCudaErrors(cudaMalloc((void **)&dData, size));

     // Allocate array and copy image data
-    cudaChannelFormatDesc channelDesc =
-        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
     cudaArray *cuArray;
-    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height,
-                                    cudaArraySurfaceLoadStore));
+    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height, cudaArraySurfaceLoadStore));

     dim3 dimBlock(8, 8, 1);
     dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
@@ -211,11 +210,9 @@ void runTest(int argc, char **argv) {
     checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));
 #if 1
     checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
-    surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height,
-                                              outputSurface);
+    surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, outputSurface);
 #else // This is what differs from the example simpleTexture
-    checkCudaErrors(
-        cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
 #endif

     cudaTextureObject_t tex;
@@ -254,8 +251,7 @@ void runTest(int argc, char **argv) {
     cudaDeviceSynchronize();
     sdkStopTimer(&timer);
     printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
-    printf("%.2f Mpixels/sec\n",
-           (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
+    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
     sdkDeleteTimer(&timer);

     // Allocate mem for the result on host side
@@ -272,9 +268,9 @@ void runTest(int argc, char **argv) {
     // Write regression file if necessary
     if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
         // Write file for regression test
-        sdkWriteFile<float>("./data/regression.dat", hOData, width * height, 0.0f,
-                            false);
-    } else {
+        sdkWriteFile<float>("./data/regression.dat", hOData, width * height, 0.0f, false);
+    }
+    else {
         // We need to reload the data from disk,
         // because it is inverted upon output
         sdkLoadPGM(outputFilename, &hOData, &width, &height);
@@ -282,8 +278,7 @@ void runTest(int argc, char **argv) {
         printf("Comparing files\n");
         printf("\toutput: <%s>\n", outputFilename);
         printf("\treference: <%s>\n", refPath);
-        testResult =
-            compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f);
+        testResult = compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f);
     }

     checkCudaErrors(cudaDestroySurfaceObject(outputSurface));
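For the transformKernel half of the sample, the texture object is backed by the same cudaArray the surface writes into. A hedged sketch of an array-backed texture object (the sample's exact filter and address modes are not visible in these hunks, so those settings are assumptions):

    cudaResourceDesc texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));
    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cuArray;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));
    texDescr.normalizedCoords = true;                 // transformKernel uses normalized coordinates
    texDescr.filterMode       = cudaFilterModeLinear; // assumption
    texDescr.addressMode[0]   = cudaAddressModeWrap;  // assumption
    texDescr.addressMode[1]   = cudaAddressModeWrap;  // assumption
    texDescr.readMode         = cudaReadModeElementType;

    cudaTextureObject_t tex;
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));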
@@ -68,10 +68,11 @@
 // this
 // struct by putting an undefined symbol in the function body so it won't
 // compile.
-template <typename T>
-struct SharedMemory {
+template <typename T> struct SharedMemory
+{
     // Ensure that we won't compile any un-specialized types
-    __device__ T *getPointer() {
+    __device__ T *getPointer()
+    {
         extern __device__ void error(void);
         error();
         return NULL;
@@ -82,89 +83,100 @@ struct SharedMemory {
 // int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double
 // One could also specialize it for user-defined types.

-template <>
-struct SharedMemory<int> {
-    __device__ int *getPointer() {
+template <> struct SharedMemory<int>
+{
+    __device__ int *getPointer()
+    {
         extern __shared__ int s_int[];
         return s_int;
     }
 };

-template <>
-struct SharedMemory<unsigned int> {
-    __device__ unsigned int *getPointer() {
+template <> struct SharedMemory<unsigned int>
+{
+    __device__ unsigned int *getPointer()
+    {
         extern __shared__ unsigned int s_uint[];
         return s_uint;
     }
 };

-template <>
-struct SharedMemory<char> {
-    __device__ char *getPointer() {
+template <> struct SharedMemory<char>
+{
+    __device__ char *getPointer()
+    {
         extern __shared__ char s_char[];
         return s_char;
     }
 };

-template <>
-struct SharedMemory<unsigned char> {
-    __device__ unsigned char *getPointer() {
+template <> struct SharedMemory<unsigned char>
+{
+    __device__ unsigned char *getPointer()
+    {
         extern __shared__ unsigned char s_uchar[];
         return s_uchar;
     }
 };

-template <>
-struct SharedMemory<short> {
-    __device__ short *getPointer() {
+template <> struct SharedMemory<short>
+{
+    __device__ short *getPointer()
+    {
         extern __shared__ short s_short[];
         return s_short;
     }
 };

-template <>
-struct SharedMemory<unsigned short> {
-    __device__ unsigned short *getPointer() {
+template <> struct SharedMemory<unsigned short>
+{
+    __device__ unsigned short *getPointer()
+    {
         extern __shared__ unsigned short s_ushort[];
         return s_ushort;
     }
 };

-template <>
-struct SharedMemory<long> {
-    __device__ long *getPointer() {
+template <> struct SharedMemory<long>
+{
+    __device__ long *getPointer()
+    {
         extern __shared__ long s_long[];
         return s_long;
     }
 };

-template <>
-struct SharedMemory<unsigned long> {
-    __device__ unsigned long *getPointer() {
+template <> struct SharedMemory<unsigned long>
+{
+    __device__ unsigned long *getPointer()
+    {
         extern __shared__ unsigned long s_ulong[];
         return s_ulong;
     }
 };

-template <>
-struct SharedMemory<bool> {
-    __device__ bool *getPointer() {
+template <> struct SharedMemory<bool>
+{
+    __device__ bool *getPointer()
+    {
         extern __shared__ bool s_bool[];
         return s_bool;
     }
 };

-template <>
-struct SharedMemory<float> {
-    __device__ float *getPointer() {
+template <> struct SharedMemory<float>
+{
+    __device__ float *getPointer()
+    {
         extern __shared__ float s_float[];
         return s_float;
     }
 };

-template <>
-struct SharedMemory<double> {
-    __device__ double *getPointer() {
+template <> struct SharedMemory<double>
+{
+    __device__ double *getPointer()
+    {
         extern __shared__ double s_double[];
         return s_double;
     }
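The reason for this wall of specializations: a templated kernel cannot simply declare extern __shared__ T sdata[] for each T, because the distinct extern declarations would all name the same shared-memory symbol with conflicting types, which the compiler rejects. SharedMemory<T> works around that by giving each supported type its own distinctly named extern array. Typical usage in a templated kernel (a minimal sketch; the kernel name is illustrative):

    template <class T> __global__ void copyViaShared(T *g_idata, T *g_odata)
    {
        SharedMemory<T> smem;
        T *sdata = smem.getPointer(); // resolves to s_int, s_float, ... per specialization

        sdata[threadIdx.x] = g_idata[threadIdx.x];
        __syncthreads();
        g_odata[threadIdx.x] = sdata[threadIdx.x];
    }

    // Host side: the dynamic shared-memory size is the third launch parameter, e.g.
    // copyViaShared<float><<<grid, block, block.x * sizeof(float)>>>(d_in, d_out);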
@@ -32,17 +32,17 @@
  */

 // System includes
-#include <stdio.h>
 #include <assert.h>
-#include <string.h>
 #include <math.h>
+#include <stdio.h>
+#include <string.h>

 // CUDA runtime
 #include <cuda_runtime.h>

 // helper functions and utilities to work with CUDA
-#include <helper_functions.h>
 #include <helper_cuda.h>
+#include <helper_functions.h>

 #ifndef MAX
 #define MAX(a, b) (a > b ? a : b)
@@ -58,8 +58,8 @@ int g_TotalFailures = 0;
 //! @param g_idata input data in global memory
 //! @param g_odata output data in global memory
 ////////////////////////////////////////////////////////////////////////////////
-template <class T>
-__global__ void testKernel(T *g_idata, T *g_odata) {
+template <class T> __global__ void testKernel(T *g_idata, T *g_odata)
+{
     // Shared mem size is determined by the host app at run time
     SharedMemory<T> smem;
     T *sdata = smem.getPointer();
@@ -83,11 +83,10 @@ __global__ void testKernel(T *g_idata, T *g_odata) {

 ////////////////////////////////////////////////////////////////////////////////
 // declaration, forward
-template <class T>
-void runTest(int argc, char **argv, int len);
+template <class T> void runTest(int argc, char **argv, int len);

-template <class T>
-void computeGold(T *reference, T *idata, const unsigned int len) {
+template <class T> void computeGold(T *reference, T *idata, const unsigned int len)
+{
     const T T_len = static_cast<T>(len);

     for (unsigned int i = 0; i < len; ++i) {
@@ -98,7 +97,8 @@ void computeGold(T *reference, T *idata, const unsigned int len) {
 ////////////////////////////////////////////////////////////////////////////////
 // Program main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     printf("> runTest<float,32>\n");
     runTest<float>(argc, argv, 32);
     printf("> runTest<int,64>\n");
@@ -114,60 +114,63 @@ int main(int argc, char **argv) {
 // functions for different types.

 // Here's the generic wrapper for cutCompare*
-template <class T>
-class ArrayComparator {
+template <class T> class ArrayComparator
+{
 public:
-    bool compare(const T *reference, T *data, unsigned int len) {
-        fprintf(stderr,
-                "Error: no comparison function implemented for this type\n");
+    bool compare(const T *reference, T *data, unsigned int len)
+    {
+        fprintf(stderr, "Error: no comparison function implemented for this type\n");
         return false;
     }
 };

 // Here's the specialization for ints:
-template <>
-class ArrayComparator<int> {
+template <> class ArrayComparator<int>
+{
 public:
-    bool compare(const int *reference, int *data, unsigned int len) {
+    bool compare(const int *reference, int *data, unsigned int len)
+    {
         return compareData(reference, data, len, 0.15f, 0.0f);
     }
 };

 // Here's the specialization for floats:
-template <>
-class ArrayComparator<float> {
+template <> class ArrayComparator<float>
+{
 public:
-    bool compare(const float *reference, float *data, unsigned int len) {
+    bool compare(const float *reference, float *data, unsigned int len)
+    {
         return compareData(reference, data, len, 0.15f, 0.15f);
     }
 };

 // Here's the generic wrapper for cutWriteFile*
-template <class T>
-class ArrayFileWriter {
+template <class T> class ArrayFileWriter
+{
 public:
-    bool write(const char *filename, T *data, unsigned int len, float epsilon) {
-        fprintf(stderr,
-                "Error: no file write function implemented for this type\n");
+    bool write(const char *filename, T *data, unsigned int len, float epsilon)
+    {
+        fprintf(stderr, "Error: no file write function implemented for this type\n");
         return false;
     }
 };

 // Here's the specialization for ints:
-template <>
-class ArrayFileWriter<int> {
+template <> class ArrayFileWriter<int>
+{
 public:
-    bool write(const char *filename, int *data, unsigned int len, float epsilon) {
+    bool write(const char *filename, int *data, unsigned int len, float epsilon)
+    {
         return sdkWriteFile(filename, data, len, epsilon, false);
     }
 };

 // Here's the specialization for floats:
-template <>
-class ArrayFileWriter<float> {
+template <> class ArrayFileWriter<float>
+{
 public:
-    bool write(const char *filename, float *data, unsigned int len,
-               float epsilon) {
+    bool write(const char *filename, float *data, unsigned int len, float epsilon)
+    {
         return sdkWriteFile(filename, data, len, epsilon, false);
     }
 };
@@ -175,8 +178,8 @@ class ArrayFileWriter<float> {
 ////////////////////////////////////////////////////////////////////////////////
 //! Run a simple test for CUDA
 ////////////////////////////////////////////////////////////////////////////////
-template <class T>
-void runTest(int argc, char **argv, int len) {
+template <class T> void runTest(int argc, char **argv, int len)
+{
     int devID;
     cudaDeviceProp deviceProps;

@@ -184,8 +187,7 @@ void runTest(int argc, char **argv, int len) {

     // get number of SMs on this GPU
     checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
-    printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name,
-           deviceProps.multiProcessorCount);
+    printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);

     // create and start timer
     StopWatchInterface *timer = NULL;
@@ -209,8 +211,7 @@ void runTest(int argc, char **argv, int len) {
     T *d_idata;
     checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
     // copy host memory to device
-    checkCudaErrors(
-        cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

     // allocate device memory for result
     T *d_odata;
@@ -229,8 +230,7 @@ void runTest(int argc, char **argv, int len) {
     // allocate mem for the result on host side
     T *h_odata = (T *)malloc(mem_size);
     // copy result from device to host
-    checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads,
-                               cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads, cudaMemcpyDeviceToHost));

     sdkStopTimer(&timer);
     printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
@@ -247,7 +247,8 @@ void runTest(int argc, char **argv, int len) {
     if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
|
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
|
||||||
// write file for regression test
|
// write file for regression test
|
||||||
writer.write("./data/regression.dat", h_odata, num_threads, 0.0f);
|
writer.write("./data/regression.dat", h_odata, num_threads, 0.0f);
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
// custom output handling when no regression test running
|
// custom output handling when no regression test running
|
||||||
// in this case check if the result is equivalent to the expected solution
|
// in this case check if the result is equivalent to the expected solution
|
||||||
bool res = comparator.compare(reference, h_odata, num_threads);
|
bool res = comparator.compare(reference, h_odata, num_threads);
|
||||||
|
|||||||
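A minimal, self-contained sketch of the specialization pattern these hunks reformat: the primary template is a loud failure path, and each explicit specialization supplies the type-specific behavior. The `Comparator` name and the exact-match integer check below are illustrative stand-ins, not the sample's `compareData` helper.

#include <cstdio>

template <class T> struct Comparator
{
    // Generic fallback: reject types we have no comparison for.
    bool compare(const T *, const T *, unsigned int)
    {
        fprintf(stderr, "no comparison for this type\n");
        return false;
    }
};

template <> struct Comparator<int>
{
    // Exact match is fine for integers (floats would need a tolerance).
    bool compare(const int *ref, const int *data, unsigned int len)
    {
        for (unsigned int i = 0; i < len; ++i)
            if (ref[i] != data[i])
                return false;
        return true;
    }
};

int main()
{
    int a[3] = {1, 2, 3}, b[3] = {1, 2, 3};
    Comparator<int> c;
    return c.compare(a, b, 3) ? 0 : 1;
}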
@@ -34,10 +34,10 @@
 */

// Includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@@ -73,8 +73,8 @@ bool testResult = true;
//! Transform an image using texture lookups
//! @param outputData output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *outputData, int width, int height, float theta, cudaTextureObject_t tex)
{
    // calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -98,23 +98,24 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    // Process command-line arguments
    if (argc > 1) {
        if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
            getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename);

            if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
                getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename);
            }
            else {
                printf("-input flag should be used with -reference flag");
                exit(EXIT_FAILURE);
            }
        }
        else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
            printf("-reference flag should be used with -input flag");
            exit(EXIT_FAILURE);
        }
    }
@@ -122,15 +123,15 @@ int main(int argc, char **argv) {

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    int devID = findCudaDevice(argc, (const char **)argv);

    // load image from disk
@@ -164,12 +165,10 @@ void runTest(int argc, char **argv) {
    checkCudaErrors(cudaMalloc((void **)&dData, size));

    // Allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *cuArray;
    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
    checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));

    cudaTextureObject_t tex;
    cudaResourceDesc texRes;
@@ -209,8 +208,7 @@ void runTest(int argc, char **argv) {
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // Allocate mem for the result on host side
@@ -228,9 +226,9 @@ void runTest(int argc, char **argv) {
    // Write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // Write file for regression test
        sdkWriteFile<float>("./data/regression.dat", hOutputData, width * height, 0.0f, false);
    }
    else {
        // We need to reload the data from disk,
        // because it is inverted upon output
        sdkLoadPGM(outputFilename, &hOutputData, &width, &height);
@@ -239,8 +237,7 @@ void runTest(int argc, char **argv) {
        printf("\toutput: <%s>\n", outputFilename);
        printf("\treference: <%s>\n", refPath);

        testResult = compareData(hOutputData, hDataRef, width * height, MAX_EPSILON_ERROR, 0.15f);
    }

    checkCudaErrors(cudaDestroyTextureObject(tex));
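For context on the texture-object API these hunks reformat, a hedged, minimal sketch: bind a cudaArray to a cudaTextureObject_t and sample it from a kernel. Sizes and values are made up, and cudaMemcpy2DToArray stands in for the deprecated cudaMemcpyToArray that the sample still calls.

#include <cuda_runtime.h>
#include <cstdio>

__global__ void sample(float *out, cudaTextureObject_t t)
{
    // Normalized coordinates: (0.5, 0.5) is the center of the image.
    out[0] = tex2D<float>(t, 0.5f, 0.5f);
}

int main()
{
    const int w = 4, h = 4;
    float host[w * h] = {0};
    host[5] = 1.0f;

    cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray *arr;
    cudaMallocArray(&arr, &desc, w, h);
    cudaMemcpy2DToArray(arr, 0, 0, host, w * sizeof(float), w * sizeof(float), h, cudaMemcpyHostToDevice);

    cudaResourceDesc res = {};
    res.resType = cudaResourceTypeArray;
    res.res.array.array = arr;

    cudaTextureDesc td = {};
    td.normalizedCoords = 1;
    td.filterMode = cudaFilterModeLinear;
    td.addressMode[0] = td.addressMode[1] = cudaAddressModeWrap;
    td.readMode = cudaReadModeElementType;

    cudaTextureObject_t tex;
    cudaCreateTextureObject(&tex, &res, &td, NULL);

    float *d_out, v;
    cudaMalloc((void **)&d_out, sizeof(float));
    sample<<<1, 1>>>(d_out, tex);
    cudaMemcpy(&v, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("sampled %f\n", v);

    cudaDestroyTextureObject(tex);
    cudaFreeArray(arr);
    cudaFree(d_out);
    return 0;
}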
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed.

## References (for more details)
@@ -32,11 +32,11 @@
   using 3D texture lookups.
*/

#include <helper_gl.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if defined(__APPLE__) || defined(MACOSX)
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
@@ -49,9 +49,9 @@
#endif

// includes, cuda
#include <cuda_gl_interop.h>
#include <cuda_runtime.h>
#include <vector_types.h>

// CUDA utilities and system includes
#include <helper_cuda.h>
@@ -76,8 +76,7 @@ const dim3 gridSize(width / blockSize.x, height / blockSize.y);
float w = 0.5; // texture coordinate in z

GLuint pbo;                                     // OpenGL pixel buffer object
struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO)

bool linearFiltering = true;
bool animate = true;
@@ -105,13 +104,13 @@ char **pArgv = NULL;
extern "C" void cleanup();
extern "C" void setTextureFilterMode(bool bLinearFilter);
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize);
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w);
extern void cleanupCuda();

void loadVolumeData(char *exec_path);

void computeFPS()
{
    frameCount++;
    fpsCount++;

@@ -129,13 +128,13 @@ void computeFPS() {
}

// render image using CUDA
void render()
{
    // map PBO to get CUDA device pointer
    g_GraphicsMapFlag++;
    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
    size_t num_bytes;
    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource));
    // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);

    // call CUDA kernel, writing results to PBO
@@ -150,7 +149,8 @@ void render() {
}

// display results using OpenGL (called by GLUT)
void display()
{
    sdkStartTimer(&timer);

    render();
@@ -172,14 +172,16 @@ void display() {
    computeFPS();
}

void idle()
{
    if (animate) {
        w += 0.01f;
        glutPostRedisplay();
    }
}

void keyboard(unsigned char key, int x, int y)
{
    switch (key) {
        case 27:
#if defined(__APPLE__) || defined(MACOSX)
@@ -216,7 +218,8 @@ void keyboard(unsigned char key, int x, int y) {
    glutPostRedisplay();
}

void reshape(int x, int y)
{
    glViewport(0, 0, x, y);

    glMatrixMode(GL_MODELVIEW);
@@ -227,7 +230,8 @@ void reshape(int x, int y) {
    glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
}

void cleanup()
{
    sdkDeleteTimer(&timer);

    // add extra check to unmap the resource before unregistering it
@@ -242,21 +246,21 @@ void cleanup() {
    cleanupCuda();
}

void initGLBuffers()
{
    // create pixel buffer object
    glGenBuffers(1, &pbo);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
    glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, 0, GL_STREAM_DRAW_ARB);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

    // register this buffer object with CUDA
    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));
}

// Load raw data from disk
uchar *loadRawFile(const char *filename, size_t size)
{
    FILE *fp = fopen(filename, "rb");

    if (!fp) {
@@ -273,7 +277,8 @@ uchar *loadRawFile(const char *filename, size_t size) {
    return data;
}

void initGL(int *argc, char **argv)
{
    // initialize GLUT callback functions
    glutInit(argc, argv);
    glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
@@ -284,16 +289,15 @@ void initGL(int *argc, char **argv) {
    glutReshapeFunc(reshape);
    glutIdleFunc(idle);

    if (!isGLVersionSupported(2, 0) || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) {
        fprintf(stderr, "Required OpenGL extensions are missing.");
        exit(EXIT_FAILURE);
    }
}

void runAutoTest(const char *ref_file, char *exec_path)
{
    checkCudaErrors(cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4));

    // render the volumeData
    render_kernel(gridSize, blockSize, d_output, width, height, w);
@@ -302,15 +306,15 @@ void runAutoTest(const char *ref_file, char *exec_path) {
    getLastCudaError("render_kernel failed");

    void *h_output = malloc(width * height * sizeof(GLubyte) * 4);
    checkCudaErrors(cudaMemcpy(h_output, d_output, width * height * sizeof(GLubyte) * 4, cudaMemcpyDeviceToHost));
    sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4, "simpleTexture3D.bin");

    bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin",
                                              sdkFindFilePath(ref_file, exec_path),
                                              width * height,
                                              MAX_EPSILON_ERROR,
                                              THRESHOLD,
                                              exec_path);

    checkCudaErrors(cudaFree(d_output));
    free(h_output);
@@ -321,13 +325,13 @@ void runAutoTest(const char *ref_file, char *exec_path) {
    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void loadVolumeData(char *exec_path)
{
    // load volume data
    const char *path = sdkFindFilePath(volumeFilename, exec_path);

    if (path == NULL) {
        fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
        exit(EXIT_FAILURE);
    }

@@ -343,7 +347,8 @@ void loadVolumeData(char *exec_path) {
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

@@ -367,7 +372,8 @@ int main(int argc, char **argv) {
    if (ref_file) {
        loadVolumeData(argv[0]);
        runAutoTest(ref_file, argv[0]);
    }
    else {
        initGL(&argc, argv);

        // OpenGL buffers
@@ -376,8 +382,7 @@ int main(int argc, char **argv) {
        loadVolumeData(argv[0]);
    }

    printf("Press space to toggle animation\n"
           "Press '+' and '-' to change displayed slice\n");

#if defined(__APPLE__) || defined(MACOSX)
@@ -28,13 +28,12 @@
#ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#define _SIMPLETEXTURE3D_KERNEL_CU_

#include <helper_cuda.h>
#include <helper_math.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef unsigned int uint;
typedef unsigned char uchar;
@@ -42,8 +41,8 @@ typedef unsigned char uchar;
cudaArray *d_volumeArray = 0;
cudaTextureObject_t tex; // 3D texture

__global__ void d_render(uint *d_output, uint imageW, uint imageH, float w, cudaTextureObject_t texObj)
{
    uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;

@@ -59,7 +58,8 @@ __global__ void d_render(uint *d_output, uint imageW, uint imageH, float w,
    }
}

extern "C" void setTextureFilterMode(bool bLinearFilter)
{
    if (tex) {
        checkCudaErrors(cudaDestroyTextureObject(tex));
    }
@@ -73,8 +73,7 @@ extern "C" void setTextureFilterMode(bool bLinearFilter) {
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texDescr.normalizedCoords = true;
    texDescr.filterMode = bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
    texDescr.addressMode[0] = cudaAddressModeWrap;
    texDescr.addressMode[1] = cudaAddressModeWrap;
@@ -84,7 +83,8 @@ extern "C" void setTextureFilterMode(bool bLinearFilter) {
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
}

extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize)
{
    // create 3D array
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
    checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
@@ -92,8 +92,7 @@ extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize) {
    // copy data to 3D array
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr =
        make_cudaPitchedPtr((void *)h_volume, volumeSize.width * sizeof(uchar), volumeSize.width, volumeSize.height);
    copyParams.dstArray = d_volumeArray;
    copyParams.extent = volumeSize;
    copyParams.kind = cudaMemcpyHostToDevice;
@@ -121,12 +120,13 @@ extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize) {
    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
}

extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w)
{
    d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, w, tex);
}

void cleanupCuda()
{
    if (tex) {
        checkCudaErrors(cudaDestroyTextureObject(tex));
    }
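A hedged, minimal sketch of the 3D-texture path reformatted above: upload a tiny volume with cudaMemcpy3D and fetch it with tex3D<>(). The 2x2x2 volume and all values are illustrative.

#include <cuda_runtime.h>
#include <cstdio>

__global__ void fetch(float *out, cudaTextureObject_t vol)
{
    // Sample the volume center with normalized coordinates.
    out[0] = tex3D<float>(vol, 0.5f, 0.5f, 0.5f);
}

int main()
{
    const int N = 2; // 2x2x2 volume
    float host[N * N * N];
    for (int i = 0; i < N * N * N; ++i)
        host[i] = (float)i;

    cudaExtent extent = make_cudaExtent(N, N, N);
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    cudaArray *vol;
    cudaMalloc3DArray(&vol, &desc, extent);

    // Describe the host slab and copy it into the 3D array.
    cudaMemcpy3DParms p = {0};
    p.srcPtr = make_cudaPitchedPtr(host, N * sizeof(float), N, N);
    p.dstArray = vol;
    p.extent = extent;
    p.kind = cudaMemcpyHostToDevice;
    cudaMemcpy3D(&p);

    cudaResourceDesc res = {};
    res.resType = cudaResourceTypeArray;
    res.res.array.array = vol;

    cudaTextureDesc td = {};
    td.normalizedCoords = 1;
    td.filterMode = cudaFilterModeLinear;
    td.addressMode[0] = td.addressMode[1] = td.addressMode[2] = cudaAddressModeWrap;
    td.readMode = cudaReadModeElementType;

    cudaTextureObject_t tex;
    cudaCreateTextureObject(&tex, &res, &td, NULL);

    float *d_out, v;
    cudaMalloc((void **)&d_out, sizeof(float));
    fetch<<<1, 1>>>(d_out, tex);
    cudaMemcpy(&v, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("center sample: %f\n", v); // linear filter averages the 8 voxels

    cudaDestroyTextureObject(tex);
    cudaFreeArray(vol);
    cudaFree(d_out);
    return 0;
}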
@@ -39,16 +39,16 @@
 */

// includes, system
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, CUDA
#include <builtin_types.h>
#include <cuda.h>
// includes, project
#include <helper_cuda_drvapi.h>
#include <helper_functions.h>
@@ -65,8 +65,7 @@ float angle = 0.5f; // angle to rotate image by (in radians)
// declaration, forward
void runTest(int argc, char **argv);

extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

static CUresult initCUDA(int argc, char **argv, CUfunction *);

@@ -84,7 +83,8 @@ CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;

void showHelp()
{
    printf("\n> [%s] Command line options\n", sSDKsample);
    printf("\t-device=n (where n=0,1,2.... for the GPU device)\n\n");
}
@@ -92,7 +92,8 @@ void showHelp() {
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
        showHelp();
        return 0;
@@ -104,7 +105,8 @@ int main(int argc, char **argv) {
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    bool bTestResults = true;

    // initialize CUDA
@@ -191,18 +193,17 @@ void runTest(int argc, char **argv) {
        // Launching (simpler method)
        void *args[5] = {&d_data, &width, &height, &angle, &TexObject};

        checkCudaErrors(cuLaunchKernel(
            transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
        checkCudaErrors(cuCtxSynchronize());
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);

        // launch kernel again for performance measurement
        checkCudaErrors(cuLaunchKernel(
            transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
    }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (advanced method)
        int offset = 0;
@@ -222,29 +223,43 @@ void runTest(int argc, char **argv) {
        *((CUtexObject *)&argBuffer[offset]) = TexObject;
        offset += sizeof(TexObject);

        void *kernel_launch_config[5] = {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};

        // new CUDA 4.0 Driver API Kernel launch call (warmup)
        checkCudaErrors(cuLaunchKernel(transform,
                                       (width / block_size),
                                       (height / block_size),
                                       1,
                                       block_size,
                                       block_size,
                                       1,
                                       0,
                                       NULL,
                                       NULL,
                                       (void **)&kernel_launch_config));
        checkCudaErrors(cuCtxSynchronize());
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);

        // launch kernel again for performance measurement
        checkCudaErrors(cuLaunchKernel(transform,
                                       (width / block_size),
                                       (height / block_size),
                                       1,
                                       block_size,
                                       block_size,
                                       1,
                                       0,
                                       0,
                                       NULL,
                                       (void **)&kernel_launch_config));
    }

    checkCudaErrors(cuCtxSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // allocate mem for the result on host side
@@ -262,17 +277,16 @@ void runTest(int argc, char **argv) {
    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, false);
    }
    else {
        // We need to reload the data from disk, because it is inverted upon output
        sdkLoadPGM(output_filename, &h_odata, &width, &height);

        printf("Comparing files\n");
        printf("\toutput: <%s>\n", output_filename);
        printf("\treference: <%s>\n", ref_path);
        bTestResults = compareData(h_odata, h_data_ref, width * height, MIN_EPSILON_ERROR, 0.15f);
    }

    // cleanup memory
@@ -293,7 +307,8 @@ void runTest(int argc, char **argv) {
//! kernel function. After the module is loaded, cuModuleGetFunction
//! retrieves the CUDA function pointer "cuFunction"
////////////////////////////////////////////////////////////////////////////////
static CUresult initCUDA(int argc, char **argv, CUfunction *transform)
{
    CUfunction cuFunction = 0;
    int major = 0, minor = 0, devID = 0;
    char deviceName[100];
@@ -302,10 +317,8 @@ static CUresult initCUDA(int argc, char **argv, CUfunction *transform) {
    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

    // get compute capabilities and the devicename
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

@@ -316,7 +329,8 @@ static CUresult initCUDA(int argc, char **argv, CUfunction *transform) {

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

@@ -328,8 +342,7 @@ static CUresult initCUDA(int argc, char **argv, CUfunction *transform) {
    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, "transformKernel"));

    *transform = cuFunction;
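The "advanced" launch path above serializes kernel arguments into a byte buffer handed to cuLaunchKernel through CU_LAUNCH_PARAM_BUFFER_POINTER. A sketch of just the packing arithmetic, under the assumption that each parameter sits at its natural alignment; packArg is a hypothetical helper and no kernel is launched here.

#include <cstdio>
#include <cstring>

// Align `offset` up to T's alignment before storing the value, mirroring
// what the driver expects for each serialized kernel parameter.
template <class T> int packArg(char *buf, int offset, T value)
{
    int align = (int)alignof(T);
    offset = (offset + align - 1) & ~(align - 1);
    memcpy(buf + offset, &value, sizeof(T));
    return offset + (int)sizeof(T);
}

int main()
{
    char argBuffer[256];
    int offset = 0;

    float *d_data = nullptr; // a device pointer would come from cuMemAlloc
    int width = 512, height = 512;
    float angle = 0.5f;

    offset = packArg(argBuffer, offset, d_data);
    offset = packArg(argBuffer, offset, width);
    offset = packArg(argBuffer, offset, height);
    offset = packArg(argBuffer, offset, angle);

    // The real code then builds:
    //   void *cfg[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
    //                  CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,
    //                  CU_LAUNCH_PARAM_END};
    // and passes cfg as the `extra` argument of cuLaunchKernel.
    printf("packed %d bytes of kernel parameters\n", offset);
    return 0;
}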
@@ -33,9 +33,8 @@
//! Transform an image using texture lookups
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta, CUtexObject tex)
{
    // calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
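The kernel above rotates each output pixel's normalized coordinates by theta about the image center before the texture fetch. A plain-CPU sketch of that coordinate math, with made-up values.

#include <math.h>
#include <stdio.h>

int main()
{
    const int width = 4, height = 4;
    const float theta = 0.5f; // radians

    unsigned int x = 1, y = 2; // a sample output pixel

    // normalized coordinates, shifted so the image center is (0, 0)
    float u = x / (float)width - 0.5f;
    float v = y / (float)height - 0.5f;

    // rotate about the center, then shift back into [0, 1]
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;

    printf("pixel (%u,%u) samples the texture at (%f, %f)\n", x, y, tu, tv);
    return 0;
}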
@@ -53,7 +53,8 @@ static const char *sSDKsample = "[simpleVoteIntrinsics]\0";
#include "simpleVote_kernel.cuh"

// Generate the test pattern for Tests 1 and 2
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
{
    // For testing VOTE.Any (all of these threads will return 0)
    for (int i = 0; i < size / 4; i++) {
        VOTE_PATTERN[i] = 0x00000000;
@@ -75,8 +76,8 @@ void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size) {
    }
}

int checkErrors1(unsigned int *h_result, int start, int end, int warp_size, const char *voteType)
{
    int i, sum = 0;

    for (sum = 0, i = start; i < end; i++) {
@@ -96,8 +97,8 @@ int checkErrors1(unsigned int *h_result, int start, int end, int warp_size,
    return (sum > 0);
}

int checkErrors2(unsigned int *h_result, int start, int end, int warp_size, const char *voteType)
{
    int i, sum = 0;

    for (sum = 0, i = start; i < end; i++) {
@@ -118,49 +119,42 @@ int checkErrors2(unsigned int *h_result, int start, int end, int warp_size,
}

// Verification code for Kernel #1
int checkResultsVoteAnyKernel1(unsigned int *h_result, int size, int warp_size)
{
    int error_count = 0;

    error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
    error_count += checkErrors2(
        h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
    error_count += checkErrors2(
        h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
    error_count += checkErrors2(
        h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");

    printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
    return error_count;
}

// Verification code for Kernel #2
int checkResultsVoteAllKernel2(unsigned int *h_result, int size, int warp_size)
{
    int error_count = 0;

    error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
    error_count += checkErrors1(
        h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
    error_count += checkErrors1(
        h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
    error_count += checkErrors2(
        h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");

    printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
    return error_count;
}

// Verification code for Kernel #3
int checkResultsVoteAnyKernel3(bool *hinfo, int size)
{
    int i, error_count = 0;

    for (i = 0; i < size * 3; i++) {
@@ -198,7 +192,8 @@ int checkResultsVoteAnyKernel3(bool *hinfo, int size) {
    return error_count;
}

int main(int argc, char **argv)
{
    unsigned int *h_input, *h_result;
    unsigned int *d_input, *d_result;

@@ -216,24 +211,20 @@ int main(int argc, char **argv) {
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

    // Statistics about the GPU device
    printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
           deviceProp.multiProcessorCount,
           deviceProp.major,
           deviceProp.minor);

    h_input = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int));
    h_result = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int));
    checkCudaErrors(
        cudaMalloc(reinterpret_cast<void **>(&d_input), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
    checkCudaErrors(
        cudaMalloc(reinterpret_cast<void **>(&d_result), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
    genVoteTestPattern(h_input, VOTE_DATA_GROUP * warp_size);
    checkCudaErrors(
        cudaMemcpy(d_input, h_input, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyHostToDevice));

    // Start of Vote Any Test Kernel #1
    printf("[VOTE Kernel Test 1/3]\n");
@@ -242,16 +233,13 @@ int main(int argc, char **argv) {
        checkCudaErrors(cudaDeviceSynchronize());
        dim3 gridBlock(1, 1);
        dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
        VoteAnyKernel1<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
        getLastCudaError("VoteAnyKernel() execution failed\n");
        checkCudaErrors(cudaDeviceSynchronize());
    }
    checkCudaErrors(
        cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    error_count[0] += checkResultsVoteAnyKernel1(h_result, VOTE_DATA_GROUP * warp_size, warp_size);

    // Start of Vote All Test Kernel #2
    printf("\n[VOTE Kernel Test 2/3]\n");
@@ -260,23 +248,18 @@ int main(int argc, char **argv) {
        checkCudaErrors(cudaDeviceSynchronize());
        dim3 gridBlock(1, 1);
        dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
        VoteAllKernel2<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
        getLastCudaError("VoteAllKernel() execution failed\n");
        checkCudaErrors(cudaDeviceSynchronize());
    }
    checkCudaErrors(
        cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
    error_count[1] += checkResultsVoteAllKernel2(h_result, VOTE_DATA_GROUP * warp_size, warp_size);

    // Second Vote Kernel Test #3 (both Any/All)
    hinfo = reinterpret_cast<bool *>(calloc(warp_size * 3 * 3, sizeof(bool)));
    cudaMalloc(reinterpret_cast<void **>(&dinfo), warp_size * 3 * 3 * sizeof(bool));
    cudaMemcpy(dinfo, hinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyHostToDevice);

    printf("\n[VOTE Kernel Test 3/3]\n");
    printf("\tRunning <<Vote.Any>> kernel3 ...\n");
@@ -286,8 +269,7 @@ int main(int argc, char **argv) {
        checkCudaErrors(cudaDeviceSynchronize());
    }

    cudaMemcpy(hinfo, dinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyDeviceToHost);

    error_count[2] = checkResultsVoteAnyKernel3(hinfo, warp_size * 3);

@@ -303,7 +285,5 @@ int main(int argc, char **argv) {

    printf("\tShutting down...\n");

    return (error_count[0] == 0 && error_count[1] == 0 && error_count[2] == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
}
@@ -38,8 +38,8 @@
// If ANY one of the threads (within the warp) of the predicated condition
// returns a non-zero value, then all threads within this warp will return a
// non-zero value
__global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result, int size)
{
    int tx = threadIdx.x;

    int mask = 0xffffffff;

@@ -50,8 +50,8 @@ __global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result,
// If ALL of the threads (within the warp) of the predicated condition returns
// a non-zero value, then all threads within this warp will return a non-zero
// value
__global__ void VoteAllKernel2(unsigned int *input, unsigned int *result, int size)
{
    int tx = threadIdx.x;

    int mask = 0xffffffff;

@@ -60,7 +60,8 @@ __global__ void VoteAllKernel2(unsigned int *input, unsigned int *result,

// Kernel #3 is a directed test for the across-the-warp vote(all) intrinsic.
// This kernel will test for conditions across warps, and within half warps
__global__ void VoteAnyKernel3(bool *info, int warp_size)
{
    int tx = threadIdx.x;
    unsigned int mask = 0xffffffff;
    bool *offs = info + (tx * 3);
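For reference, a minimal sketch of the warp-vote intrinsics these kernels exercise (assuming the sync variants available on compute capability 7.0+; the kernel name and buffer layout here are illustrative, not part of the sample):

__global__ void voteSketch(const unsigned int *input, unsigned int *result)
{
    unsigned int mask = 0xffffffff; // all 32 lanes of the warp participate
    int          tx   = threadIdx.x;

    // __any_sync: every lane sees non-zero if at least one lane's predicate is non-zero
    result[tx] = __any_sync(mask, input[tx]);
    // __all_sync: every lane sees non-zero only if all lanes' predicates are non-zero
    result[tx + warpSize] = __all_sync(mask, input[tx]);
}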
@@ -41,7 +41,8 @@
#endif

/* Add two vectors on the GPU */
__global__ void vectorAddGPU(float *a, float *b, float *c, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < N) {

@@ -57,7 +58,8 @@ bool bPinGenericMemory = false;
#define MEMORY_ALIGNMENT 4096
#define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))

int main(int argc, char **argv)
{
    int n, nelem, deviceCount;
    int idev = 0; // use default device 0
    char *device = NULL;

@@ -73,8 +75,7 @@ int main(int argc, char **argv) {
        printf("Usage: simpleZeroCopy [OPTION]\n\n");
        printf("Options:\n");
        printf(" --device=[device #] Specify the device to be used\n");
        printf(" --use_generic_memory (optional) use generic page-aligned for system "
               "memory\n");
        return EXIT_SUCCESS;
    }

@@ -85,9 +86,7 @@ int main(int argc, char **argv) {
        idev = atoi(device);

        if (idev >= deviceCount || idev < 0) {
            fprintf(stderr, "Device number %d is invalid, will use default CUDA device 0.\n", idev);
            idev = 0;
        }
    }

@@ -108,7 +107,8 @@ int main(int argc, char **argv) {

    if (bPinGenericMemory) {
        printf("> Using Generic System Paged Memory (malloc)\n");
    }
    else {
        printf("> Using CUDA Host Allocated (cudaHostAlloc)\n");
    }

@@ -122,8 +122,7 @@ int main(int argc, char **argv) {
#if CUDART_VERSION >= 2020

    if (!deviceProp.canMapHostMemory) {
        fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", idev);

        exit(EXIT_SUCCESS);
    }

@@ -133,7 +132,9 @@ int main(int argc, char **argv) {
    fprintf(stderr,
            "CUDART version %d.%d does not support "
            "<cudaDeviceProp.canMapHostMemory> field\n",
            CUDART_VERSION / 1000,
            (CUDART_VERSION % 100) / 10);

    exit(EXIT_SUCCESS);
#endif

@@ -141,10 +142,10 @@ int main(int argc, char **argv) {
#if CUDART_VERSION < 4000

    if (bPinGenericMemory) {
        fprintf(stderr,
                "CUDART version %d.%d does not support <cudaHostRegister> function\n",
                CUDART_VERSION / 1000,
                (CUDART_VERSION % 100) / 10);

        exit(EXIT_SUCCESS);
    }

@@ -172,7 +173,8 @@ int main(int argc, char **argv) {
        checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped));
        checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped));
#endif
    }
    else {
#if CUDART_VERSION >= 2020
        flags = cudaHostAllocMapped;
        checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags));

@@ -235,7 +237,8 @@ int main(int argc, char **argv) {
        free(b_UA);
        free(c_UA);
#endif
    }
    else {
#if CUDART_VERSION >= 2020
        checkCudaErrors(cudaFreeHost(a));
        checkCudaErrors(cudaFreeHost(b));
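A condensed sketch of the zero-copy pattern the two branches above select between (hedged: the real sample's error handling and CUDART version guards are omitted, and grid/block are assumed to be set up as usual):

    // Generic path: over-allocate, align to a page boundary, then pin and map.
    a_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
    a    = (float *)ALIGN_UP(a_UA, MEMORY_ALIGNMENT);
    checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped));

    // Either path ends the same way: the kernel reads the host buffer through
    // a device alias obtained from cudaHostGetDevicePointer.
    float *d_a;
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0));
    vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem);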
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed.

## References (for more details)
@@ -29,19 +29,20 @@
 * memory.
 */

#include <cstdio>
#include <ctime>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <math.h>
#include <stdint.h>

#define min(a, b) (a) < (b) ? (a) : (b)
#define max(a, b) (a) > (b) ? (a) : (b)

#define LOOP_NUM 50

__global__ void atomicKernel(int *atom_arr)
{
    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    for (int i = 0; i < LOOP_NUM; i++) {

@@ -79,7 +80,8 @@ __global__ void atomicKernel(int *atom_arr) {
    }
}

void atomicKernel_CPU(int *atom_arr, int no_of_threads)
{
    for (int i = no_of_threads; i < 2 * no_of_threads; i++) {
        for (int j = 0; j < LOOP_NUM; j++) {
            // Atomic addition

@@ -92,23 +94,20 @@ void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
            int old, expected;
            do {
                expected = atom_arr[2];
                old = __sync_val_compare_and_swap(&atom_arr[2], expected, max(expected, i));
            } while (old != expected);

            // Atomic minimum
            do {
                expected = atom_arr[3];
                old = __sync_val_compare_and_swap(&atom_arr[3], expected, min(expected, i));
            } while (old != expected);

            // Atomic increment (modulo 17+1)
            int limit = 17;
            do {
                expected = atom_arr[4];
                old = __sync_val_compare_and_swap(&atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1);
            } while (old != expected);

            // Atomic decrement

@@ -116,8 +115,7 @@ void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
            do {
                expected = atom_arr[5];
                old = __sync_val_compare_and_swap(
                    &atom_arr[5], expected, ((expected == 0) || (expected > limit)) ? limit : expected - 1);
            } while (old != expected);

            // Atomic compare-and-swap

@@ -145,7 +143,8 @@ void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
//! @param idata  input data as provided to device
//! @param len    number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
int verify(int *testData, const int len)
{
    int val = 0;

    for (int i = 0; i < len * LOOP_NUM; ++i) {

@@ -275,7 +274,8 @@ int verify(int *testData, const int len) {
    return true;
}

int main(int argc, char **argv)
{
    // set device
    cudaDeviceProp device_prop;
    int dev_id = findCudaDevice(argc, (const char **)argv);

@@ -296,8 +296,7 @@ int main(int argc, char **argv) {
    }

    if (device_prop.major < 6) {
        printf("%s: requires a minimum CUDA compute 6.0 capability, waiving "
               "testing.\n",
               argv[0]);
        exit(EXIT_WAIVED);

@@ -312,12 +311,14 @@ int main(int argc, char **argv) {
    if (device_prop.pageableMemoryAccess) {
        printf("CAN access pageable memory\n");
        atom_arr = (int *)malloc(sizeof(int) * numData);
    }
    else {
        printf("CANNOT access pageable memory\n");
        checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData));
    }

    for (unsigned int i = 0; i < numData; i++)
        atom_arr[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    atom_arr[7] = atom_arr[9] = 0xff;

@@ -332,11 +333,11 @@ int main(int argc, char **argv) {

    if (device_prop.pageableMemoryAccess) {
        free(atom_arr);
    }
    else {
        cudaFree(atom_arr);
    }

    printf("systemWideAtomics completed, returned %s \n", testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
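For contrast with the CPU-side __sync_val_compare_and_swap loops above, the GPU half of this test can express the same operations directly with system-scoped atomics (a sketch, assuming compute capability 6.0+ and memory visible to both processors, e.g. from cudaMallocManaged; the kernel name is illustrative):

__global__ void systemAtomicsSketch(int *atom_arr, int val)
{
    // _system-scoped atomics are atomic with respect to the CPU and other
    // GPUs in the system, not just threads on this device.
    atomicMax_system(&atom_arr[2], val);
    atomicMin_system(&atom_arr[3], val);
}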
@@ -31,10 +31,10 @@
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes CUDA
#include <cuda_runtime.h>

@@ -47,15 +47,15 @@
// declaration, forward
void runTest(int argc, char **argv);

extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

////////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality
//! @param g_idata  input data in global memory
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(float *g_idata, float *g_odata)
{
    // shared memory
    // the size is determined by the host application
    extern __shared__ float sdata[];

@@ -85,7 +85,8 @@ int main(int argc, char **argv) { runTest(argc, argv); }
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    bool bTestResult = true;

    printf("%s Starting...\n\n", argv[0]);

@@ -113,8 +114,7 @@ void runTest(int argc, char **argv) {
    float *d_idata;
    checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
    // copy host memory to device
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    // allocate device memory for result
    float *d_odata;

@@ -133,8 +133,7 @@ void runTest(int argc, char **argv) {
    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(mem_size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads, cudaMemcpyDeviceToHost));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));

@@ -148,7 +147,8 @@ void runTest(int argc, char **argv) {
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile("./data/regression.dat", h_odata, num_threads, 0.0f, false);
    }
    else {
        // custom output handling when no regression test running
        // in this case check if the result is equivalent to the expected solution
        bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f);
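The size of the extern __shared__ array in testKernel comes from the third launch-configuration argument; a minimal sketch of the matching host-side launch (variable names mirror the sample, the values are illustrative):

    unsigned int num_threads = 32;
    unsigned int mem_size    = sizeof(float) * num_threads;

    dim3 grid(1, 1, 1);
    dim3 threads(num_threads, 1, 1);

    // The third <<<>>> parameter is the dynamic shared memory size in bytes;
    // it backs the `extern __shared__ float sdata[]` declaration in the kernel.
    testKernel<<<grid, threads, mem_size>>>(d_idata, d_odata);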
@@ -26,8 +26,7 @@
 */

// export C interface
extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set

@@ -36,7 +35,8 @@ extern "C" void computeGold(float *reference, float *idata,
//! @param idata  input data as provided to device
//! @param len    number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
void computeGold(float *reference, float *idata, const unsigned int len)
{
    const float f_len = static_cast<float>(len);

    for (unsigned int i = 0; i < len; ++i) {
@@ -37,7 +37,6 @@

// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>

#include <helper_cuda.h>
/**
 * CUDA Kernel Device code

@@ -45,8 +44,8 @@
 * Computes the vector addition of A and B into C. The 3 vectors have the same
 * number of elements numElements.
 */
__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < numElements) {

@@ -57,7 +56,8 @@ __global__ void vectorAdd(const float *A, const float *B, float *C,
/**
 * Host main routine
 */
int main(void)
{
    // Error code to check return values for CUDA calls
    cudaError_t err = cudaSuccess;

@@ -92,8 +92,7 @@ int main(void) {
    err = cudaMalloc((void **)&d_A, size);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

@@ -102,8 +101,7 @@ int main(void) {
    err = cudaMalloc((void **)&d_B, size);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

@@ -112,8 +110,7 @@ int main(void) {
    err = cudaMalloc((void **)&d_C, size);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

@@ -124,32 +121,26 @@ int main(void) {
    err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
    err = cudaGetLastError();

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

@@ -159,9 +150,7 @@ int main(void) {
    err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

@@ -179,24 +168,21 @@ int main(void) {
    err = cudaFree(d_A);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_B);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    err = cudaFree(d_C);

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
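The (numElements + threadsPerBlock - 1) / threadsPerBlock expression above is integer ceiling division, so the grid always covers the whole vector and the i < numElements guard in the kernel absorbs the overshoot. A worked instance with the hypothetical size of 50000 elements:

    // blocksPerGrid = (50000 + 255) / 256 = 196 blocks -> 196 * 256 = 50176 threads;
    // the last 176 threads fail the i < numElements test and simply return.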
@@ -34,11 +34,11 @@
 */

// Includes
#include <cstring>
#include <cuda.h>
#include <iostream>
#include <stdio.h>
#include <string.h>

// includes, project
#include <helper_cuda_drvapi.h>

@@ -72,7 +72,8 @@ bool findModulePath(const char *, string &, char **, string &);
#endif

// Host code
int main(int argc, char **argv)
{
    printf("Vector Addition (Driver API)\n");
    int N = 50000, devID = 0;
    size_t size = N * sizeof(float);

@@ -91,7 +92,8 @@ int main(int argc, char **argv) {

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

@@ -104,8 +106,7 @@ int main(int argc, char **argv) {
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // Get function handle from module
    checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

    // Allocate input vectors h_A and h_B in host memory
    h_A = (float *)malloc(size);

@@ -139,9 +140,9 @@ int main(int argc, char **argv) {
        void *args[] = {&d_A, &d_B, &d_C, &N};

        // Launch the CUDA kernel
        checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));
    }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel
        // Launch (advanced method)
        int offset = 0;

@@ -160,9 +161,8 @@ int main(int argc, char **argv) {
        int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

        // Launch the CUDA kernel
        checkCudaErrors(
            cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, argBuffer));
    }

#ifdef _DEBUG

@@ -190,7 +190,8 @@ int main(int argc, char **argv) {
    exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
}

int CleanupNoFailure()
{
    // Free device memory
    checkCudaErrors(cuMemFree(d_A));
    checkCudaErrors(cuMemFree(d_B));

@@ -214,7 +215,8 @@ int CleanupNoFailure() {
    return EXIT_SUCCESS;
}
// Allocates an array with random float entries.
void RandomInit(float *data, int n)
{
    for (int i = 0; i < n; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
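The two launch paths above differ only in how kernel arguments travel: the simple path hands cuLaunchKernel an array of pointers to each argument (args), while the advanced path packs the raw bytes into one buffer described through the extra parameter. A hedged sketch of the packed form (offsets and alignment handling simplified; the launch shape variables are assumed set up as in the sample):

    char   paramBuffer[256];
    size_t paramSize = 0;

    // Append each argument at its naturally aligned offset (simplified here:
    // 8-byte CUdeviceptrs first, then the 4-byte int, so all offsets align).
    *(CUdeviceptr *)(paramBuffer + paramSize) = d_A; paramSize += sizeof(d_A);
    *(CUdeviceptr *)(paramBuffer + paramSize) = d_B; paramSize += sizeof(d_B);
    *(CUdeviceptr *)(paramBuffer + paramSize) = d_C; paramSize += sizeof(d_C);
    *(int *)(paramBuffer + paramSize)         = N;   paramSize += sizeof(N);

    void *extra[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, paramBuffer,
                     CU_LAUNCH_PARAM_BUFFER_SIZE,    &paramSize,
                     CU_LAUNCH_PARAM_END};
    checkCudaErrors(
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, extra));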
@@ -33,9 +33,10 @@
 */

// Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N)
        C[i] = A[i] + B[i];
}
@@ -30,4 +30,3 @@ cuMemcpyDtoH, cuDeviceCanAccessPeer, cuModuleGetFunction, cuMemSetAccess, cuMemR
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

## References (for more details)
|||||||
@ -29,10 +29,13 @@
|
|||||||
|
|
||||||
static size_t round_up(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
|
static size_t round_up(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
|
||||||
|
|
||||||
CUresult simpleMallocMultiDeviceMmap(
|
CUresult simpleMallocMultiDeviceMmap(CUdeviceptr *dptr,
|
||||||
CUdeviceptr *dptr, size_t *allocationSize, size_t size,
|
size_t *allocationSize,
|
||||||
|
size_t size,
|
||||||
const std::vector<CUdevice> &residentDevices,
|
const std::vector<CUdevice> &residentDevices,
|
||||||
const std::vector<CUdevice> &mappingDevices, size_t align) {
|
const std::vector<CUdevice> &mappingDevices,
|
||||||
|
size_t align)
|
||||||
|
{
|
||||||
CUresult status = CUDA_SUCCESS;
|
CUresult status = CUDA_SUCCESS;
|
||||||
size_t min_granularity = 0;
|
size_t min_granularity = 0;
|
||||||
size_t stripeSize;
|
size_t stripeSize;
|
||||||
@ -53,8 +56,7 @@ CUresult simpleMallocMultiDeviceMmap(
|
|||||||
|
|
||||||
// get the minnimum granularity for residentDevices[idx]
|
// get the minnimum granularity for residentDevices[idx]
|
||||||
prop.location.id = residentDevices[idx];
|
prop.location.id = residentDevices[idx];
|
||||||
status = cuMemGetAllocationGranularity(&granularity, &prop,
|
status = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
|
||||||
CU_MEM_ALLOC_GRANULARITY_MINIMUM);
|
|
||||||
if (status != CUDA_SUCCESS) {
|
if (status != CUDA_SUCCESS) {
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
@ -70,8 +72,7 @@ CUresult simpleMallocMultiDeviceMmap(
|
|||||||
|
|
||||||
// get the minnimum granularity for mappingDevices[idx]
|
// get the minnimum granularity for mappingDevices[idx]
|
||||||
prop.location.id = mappingDevices[idx];
|
prop.location.id = mappingDevices[idx];
|
||||||
status = cuMemGetAllocationGranularity(&granularity, &prop,
|
status = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
|
||||||
CU_MEM_ALLOC_GRANULARITY_MINIMUM);
|
|
||||||
if (status != CUDA_SUCCESS) {
|
if (status != CUDA_SUCCESS) {
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
@ -121,8 +122,7 @@ CUresult simpleMallocMultiDeviceMmap(
|
|||||||
// Since we do not need to make any other mappings of this memory or export
|
// Since we do not need to make any other mappings of this memory or export
|
||||||
// it, we no longer need and can release the allocationHandle. The
|
// it, we no longer need and can release the allocationHandle. The
|
||||||
// allocation will be kept live until it is unmapped.
|
// allocation will be kept live until it is unmapped.
|
||||||
status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0,
|
status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0, allocationHandle, 0);
|
||||||
allocationHandle, 0);
|
|
||||||
|
|
||||||
// the handle needs to be released even if the mapping failed.
|
// the handle needs to be released even if the mapping failed.
|
||||||
status2 = cuMemRelease(allocationHandle);
|
status2 = cuMemRelease(allocationHandle);
|
||||||
@ -157,8 +157,7 @@ CUresult simpleMallocMultiDeviceMmap(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Apply the access descriptors to the whole VA range.
|
// Apply the access descriptors to the whole VA range.
|
||||||
status = cuMemSetAccess(*dptr, size, &accessDescriptors[0],
|
status = cuMemSetAccess(*dptr, size, &accessDescriptors[0], accessDescriptors.size());
|
||||||
accessDescriptors.size());
|
|
||||||
if (status != CUDA_SUCCESS) {
|
if (status != CUDA_SUCCESS) {
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
@ -174,7 +173,8 @@ done:
|
|||||||
return status;
|
return status;
|
||||||
}
|
}
|
||||||
|
|
||||||
CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size) {
|
CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size)
|
||||||
|
{
|
||||||
CUresult status = CUDA_SUCCESS;
|
CUresult status = CUDA_SUCCESS;
|
||||||
|
|
||||||
// Unmap the mapped virtual memory region
|
// Unmap the mapped virtual memory region
|
||||||
|
|||||||
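round_up exists because cuMemCreate and cuMemMap only accept sizes that are multiples of the reported allocation granularity, so each per-device stripe and the total VA reservation are padded up to it. A worked instance (hypothetical numbers):

    // Suppose min_granularity = 2 MiB, two resident devices, size = 5 MiB:
    //   stripeSize = round_up(5 MiB / 2, 2 MiB) = round_up(2.5 MiB, 2 MiB) = 4 MiB
    // so 2 stripes * 4 MiB = 8 MiB of VA space back the 5 MiB request.
    size_t stripeSize  = round_up(size / residentDevices.size(), min_granularity);
    size_t reserveSize = stripeSize * residentDevices.size();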
@@ -63,10 +63,12 @@
//! handle
//! is not needed after its mappings are set up.
////////////////////////////////////////////////////////////////////////////
CUresult simpleMallocMultiDeviceMmap(CUdeviceptr *dptr,
                                     size_t *allocationSize,
                                     size_t size,
                                     const std::vector<CUdevice> &residentDevices,
                                     const std::vector<CUdevice> &mappingDevices,
                                     size_t align = 0);

////////////////////////////////////////////////////////////////////////////
//! Frees resources allocated by simpleMallocMultiDeviceMmap
|||||||
@ -36,11 +36,11 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
// Includes
|
// Includes
|
||||||
|
#include <cstring>
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
|
#include <iostream>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <cstring>
|
|
||||||
#include <iostream>
|
|
||||||
|
|
||||||
// includes, project
|
// includes, project
|
||||||
#include <helper_cuda_drvapi.h>
|
#include <helper_cuda_drvapi.h>
|
||||||
@ -76,7 +76,8 @@ void RandomInit(float *, int);
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
// collect all of the devices whose memory can be mapped from cuDevice.
|
// collect all of the devices whose memory can be mapped from cuDevice.
|
||||||
vector<CUdevice> getBackingDevices(CUdevice cuDevice) {
|
vector<CUdevice> getBackingDevices(CUdevice cuDevice)
|
||||||
|
{
|
||||||
int num_devices;
|
int num_devices;
|
||||||
|
|
||||||
checkCudaErrors(cuDeviceGetCount(&num_devices));
|
checkCudaErrors(cuDeviceGetCount(&num_devices));
|
||||||
@ -100,9 +101,8 @@ vector<CUdevice> getBackingDevices(CUdevice cuDevice) {
|
|||||||
|
|
||||||
// The device needs to support virtual address management for the required
|
// The device needs to support virtual address management for the required
|
||||||
// apis to work
|
// apis to work
|
||||||
checkCudaErrors(cuDeviceGetAttribute(
|
checkCudaErrors(
|
||||||
&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
|
cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice));
|
||||||
cuDevice));
|
|
||||||
if (attributeVal == 0) {
|
if (attributeVal == 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -113,7 +113,8 @@ vector<CUdevice> getBackingDevices(CUdevice cuDevice) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Host code
|
// Host code
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
printf("Vector Addition (Driver API)\n");
|
printf("Vector Addition (Driver API)\n");
|
||||||
int N = 50000;
|
int N = 50000;
|
||||||
size_t size = N * sizeof(float);
|
size_t size = N * sizeof(float);
|
||||||
@ -125,11 +126,9 @@ int main(int argc, char **argv) {
|
|||||||
cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
|
cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
|
||||||
|
|
||||||
// Check that the selected device supports virtual address management
|
// Check that the selected device supports virtual address management
|
||||||
checkCudaErrors(cuDeviceGetAttribute(
|
checkCudaErrors(
|
||||||
&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
|
cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice));
|
||||||
cuDevice));
|
printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice, attributeVal);
|
||||||
printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice,
|
|
||||||
attributeVal);
|
|
||||||
if (attributeVal == 0) {
|
if (attributeVal == 0) {
|
||||||
printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice);
|
printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice);
|
||||||
exit(EXIT_WAIVED);
|
exit(EXIT_WAIVED);
|
||||||
@ -152,17 +151,14 @@ int main(int argc, char **argv) {
|
|||||||
|
|
||||||
std::ostringstream fatbin;
|
std::ostringstream fatbin;
|
||||||
|
|
||||||
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin))
|
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
|
||||||
{
|
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
else
|
else {
|
||||||
{
|
|
||||||
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
|
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!fatbin.str().size())
|
if (!fatbin.str().size()) {
|
||||||
{
|
|
||||||
printf("fatbin file empty. exiting..\n");
|
printf("fatbin file empty. exiting..\n");
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
@ -207,10 +203,7 @@ int main(int argc, char **argv) {
|
|||||||
void *args[] = {&d_A, &d_B, &d_C, &N};
|
void *args[] = {&d_A, &d_B, &d_C, &N};
|
||||||
|
|
||||||
// Launch the CUDA kernel
|
// Launch the CUDA kernel
|
||||||
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
|
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));
|
||||||
threadsPerBlock, 1, 1,
|
|
||||||
0,
|
|
||||||
NULL, args, NULL));
|
|
||||||
|
|
||||||
// Copy result from device memory to host memory
|
// Copy result from device memory to host memory
|
||||||
// h_C contains the result in host memory
|
// h_C contains the result in host memory
|
||||||
@ -219,12 +212,10 @@ int main(int argc, char **argv) {
|
|||||||
// Verify result
|
// Verify result
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
for (i = 0; i < N; ++i)
|
for (i = 0; i < N; ++i) {
|
||||||
{
|
|
||||||
float sum = h_A[i] + h_B[i];
|
float sum = h_A[i] + h_B[i];
|
||||||
|
|
||||||
if (fabs(h_C[i] - sum) > 1e-7f)
|
if (fabs(h_C[i] - sum) > 1e-7f) {
|
||||||
{
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -243,18 +234,15 @@ int CleanupNoFailure()
|
|||||||
checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize));
|
checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize));
|
||||||
|
|
||||||
// Free host memory
|
// Free host memory
|
||||||
if (h_A)
|
if (h_A) {
|
||||||
{
|
|
||||||
free(h_A);
|
free(h_A);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (h_B)
|
if (h_B) {
|
||||||
{
|
|
||||||
free(h_B);
|
free(h_B);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (h_C)
|
if (h_C) {
|
||||||
{
|
|
||||||
free(h_C);
|
free(h_C);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -265,8 +253,7 @@ int CleanupNoFailure()
|
|||||||
// Allocates an array with random float entries.
|
// Allocates an array with random float entries.
|
||||||
void RandomInit(float *data, int n)
|
void RandomInit(float *data, int n)
|
||||||
{
|
{
|
||||||
for (int i = 0; i < n; ++i)
|
for (int i = 0; i < n; ++i) {
|
||||||
{
|
|
||||||
data[i] = rand() / (float)RAND_MAX;
|
data[i] = rand() / (float)RAND_MAX;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -34,9 +34,10 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
// Device code
|
// Device code
|
||||||
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B,
|
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
|
||||||
float *C, int N) {
|
{
|
||||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
if (i < N) C[i] = A[i] + B[i];
|
if (i < N)
|
||||||
|
C[i] = A[i] + B[i];
|
||||||
}
|
}
|
||||||
|
|||||||
@ -33,8 +33,8 @@
|
|||||||
* of the programming guide with some additions like error checking.
|
* of the programming guide with some additions like error checking.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
// For the CUDA runtime routines (prefixed with "cuda_")
|
// For the CUDA runtime routines (prefixed with "cuda_")
|
||||||
#include <cuda.h>
|
#include <cuda.h>
|
||||||
@ -42,13 +42,13 @@
|
|||||||
|
|
||||||
// helper functions and utilities to work with CUDA
|
// helper functions and utilities to work with CUDA
|
||||||
#include <helper_functions.h>
|
#include <helper_functions.h>
|
||||||
|
|
||||||
#include <nvrtc_helper.h>
|
#include <nvrtc_helper.h>
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Host main routine
|
* Host main routine
|
||||||
*/
|
*/
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
char *cubin, *kernel_file;
|
char *cubin, *kernel_file;
|
||||||
size_t cubinSize;
|
size_t cubinSize;
|
||||||
kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]);
|
kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]);
|
||||||
@ -105,19 +105,23 @@ int main(int argc, char **argv) {
|
|||||||
// Launch the Vector Add CUDA Kernel
|
// Launch the Vector Add CUDA Kernel
|
||||||
int threadsPerBlock = 256;
|
int threadsPerBlock = 256;
|
||||||
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
|
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
|
||||||
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
|
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
|
||||||
threadsPerBlock);
|
|
||||||
dim3 cudaBlockSize(threadsPerBlock, 1, 1);
|
dim3 cudaBlockSize(threadsPerBlock, 1, 1);
|
||||||
dim3 cudaGridSize(blocksPerGrid, 1, 1);
|
dim3 cudaGridSize(blocksPerGrid, 1, 1);
|
||||||
|
|
||||||
void *arr[] = {reinterpret_cast<void *>(&d_A), reinterpret_cast<void *>(&d_B),
|
void *arr[] = {reinterpret_cast<void *>(&d_A),
|
||||||
|
reinterpret_cast<void *>(&d_B),
|
||||||
reinterpret_cast<void *>(&d_C),
|
reinterpret_cast<void *>(&d_C),
|
||||||
reinterpret_cast<void *>(&numElements)};
|
reinterpret_cast<void *>(&numElements)};
|
||||||
checkCudaErrors(cuLaunchKernel(kernel_addr, cudaGridSize.x, cudaGridSize.y,
|
checkCudaErrors(cuLaunchKernel(kernel_addr,
|
||||||
|
cudaGridSize.x,
|
||||||
|
cudaGridSize.y,
|
||||||
cudaGridSize.z, /* grid dim */
|
cudaGridSize.z, /* grid dim */
|
||||||
cudaBlockSize.x, cudaBlockSize.y,
|
cudaBlockSize.x,
|
||||||
|
cudaBlockSize.y,
|
||||||
cudaBlockSize.z, /* block dim */
|
cudaBlockSize.z, /* block dim */
|
||||||
0, 0, /* shared mem, stream */
|
0,
|
||||||
|
0, /* shared mem, stream */
|
||||||
&arr[0], /* arguments */
|
&arr[0], /* arguments */
|
||||||
0));
|
0));
|
||||||
checkCudaErrors(cuCtxSynchronize());
|
checkCudaErrors(cuCtxSynchronize());
|
||||||
|
|||||||
@ -32,8 +32,8 @@
|
|||||||
* number of elements numElements.
|
* number of elements numElements.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C,
|
extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
|
||||||
int numElements) {
|
{
|
||||||
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
int i = blockDim.x * blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
if (i < numElements) {
|
if (i < numElements) {
|
||||||
|
|||||||
@@ -39,12 +39,10 @@
#include <cuda_runtime.h>

// includes
#include <cassert>
#include <cuda.h>
#include <helper_cuda.h>      // helper functions for CUDA error checking and initialization
#include <helper_functions.h> // helper for shared functions common to CUDA Samples

#include <iostream>
#include <memory>

@@ -83,8 +81,7 @@ enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
enum printMode { USER_READABLE, CSV };
enum memoryMode { PINNED, PAGEABLE };

const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device", "Device to Device", NULL};

const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL};

@@ -97,36 +94,62 @@ char **pArgv = NULL;
////////////////////////////////////////////////////////////////////////////////
// declaration, forward
int runTest(const int argc, const char **argv);
void testBandwidth(unsigned int start,
                   unsigned int end,
                   unsigned int increment,
                   testMode mode,
                   memcpyKind kind,
                   printMode printmode,
                   memoryMode memMode,
                   int startDevice,
                   int endDevice,
                   bool wc);
void testBandwidthQuick(unsigned int size,
                        memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc);
void testBandwidthRange(unsigned int start,
                        unsigned int end,
                        unsigned int increment,
                        memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc);
void testBandwidthShmoo(memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc);
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc);
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc);
float testDeviceToDeviceTransfer(unsigned int memSize);
void printResultsReadable(unsigned int *memSizes,
                          double *bandwidths,
                          unsigned int count,
                          memcpyKind kind,
                          memoryMode memMode,
                          int iNumDevs,
                          bool wc);
void printResultsCSV(unsigned int *memSizes,
                     double *bandwidths,
                     unsigned int count,
                     memcpyKind kind,
                     memoryMode memMode,
                     int iNumDevs,
                     bool wc);
void printHelp(void);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;
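The PINNED/PAGEABLE modes declared above correspond to two different host allocations; pinned (page-locked) buffers can be DMA'd directly by the copy engine and usually measure higher bandwidth. A hedged sketch of the two allocation paths (simplified from the test's helpers; memSize and wc stand in for the values parsed from the command line):

    unsigned char *h_buf = NULL;

    if (memMode == PINNED) {
        // Page-locked; optionally write-combined, which favors host-to-device streams.
        checkCudaErrors(cudaHostAlloc((void **)&h_buf, memSize, wc ? cudaHostAllocWriteCombined : 0));
    }
    else {
        // Pageable; cudaMemcpy must stage it through an internal pinned buffer.
        h_buf = (unsigned char *)malloc(memSize);
    }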
@ -144,8 +167,7 @@ int main(int argc, char **argv) {
|
|||||||
// finish
|
// finish
|
||||||
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");
|
printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");
|
||||||
|
|
||||||
printf(
|
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
|
||||||
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
|
|
||||||
"Results may vary when GPU Boost is enabled.\n");
|
"Results may vary when GPU Boost is enabled.\n");
|
||||||
|
|
||||||
free(flush_buf);
|
free(flush_buf);
|
||||||
@ -156,7 +178,8 @@ int main(int argc, char **argv) {
|
|||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
// Parse args, run the appropriate tests
|
// Parse args, run the appropriate tests
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
int runTest(const int argc, const char **argv) {
|
int runTest(const int argc, const char **argv)
|
||||||
|
{
|
||||||
int start = DEFAULT_SIZE;
|
int start = DEFAULT_SIZE;
|
||||||
int end = DEFAULT_SIZE;
|
int end = DEFAULT_SIZE;
|
||||||
int startDevice = 0;
|
int startDevice = 0;
|
||||||
@ -186,14 +209,17 @@ int runTest(const int argc, const char **argv) {
|
|||||||
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
|
if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
|
||||||
if (strcmp(memModeStr, "pageable") == 0) {
|
if (strcmp(memModeStr, "pageable") == 0) {
|
||||||
memMode = PAGEABLE;
|
memMode = PAGEABLE;
|
||||||
} else if (strcmp(memModeStr, "pinned") == 0) {
|
}
|
||||||
|
else if (strcmp(memModeStr, "pinned") == 0) {
|
||||||
memMode = PINNED;
|
memMode = PINNED;
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
printf("Invalid memory mode - valid modes are pageable or pinned\n");
|
printf("Invalid memory mode - valid modes are pageable or pinned\n");
|
||||||
printf("See --help for more information\n");
|
printf("See --help for more information\n");
|
||||||
return -1000;
|
return -1000;
|
||||||
}
|
}
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
// default - pinned memory
|
// default - pinned memory
|
||||||
memMode = PINNED;
|
memMode = PINNED;
|
||||||
}
|
}
|
||||||
@ -203,8 +229,7 @@ int runTest(const int argc, const char **argv) {
|
|||||||
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
|
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
|
||||||
|
|
||||||
if (error_id != cudaSuccess) {
|
if (error_id != cudaSuccess) {
|
||||||
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id,
|
printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
|
||||||
cudaGetErrorString(error_id));
|
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -214,19 +239,19 @@ int runTest(const int argc, const char **argv) {
    }

    if (strcmp(device, "all") == 0) {
        printf("\n!!!!!Cumulative Bandwidth to be computed from all the devices "
               "!!!!!!\n\n");
        startDevice = 0;
        endDevice = deviceCount - 1;
    }
    else {
        startDevice = endDevice = atoi(device);

        if (startDevice >= deviceCount || startDevice < 0) {
            printf("\n!!!!!Invalid GPU number %d given hence default gpu %d will be "
                   "used !!!!!\n",
                   startDevice,
                   0);
            startDevice = endDevice = 0;
        }
    }
@@ -234,8 +259,7 @@ int runTest(const int argc, const char **argv) {

    printf("Running on...\n\n");

    for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
        cudaDeviceProp deviceProp;
        cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);

@@ -250,9 +274,9 @@ int runTest(const int argc, const char **argv) {

                exit(EXIT_FAILURE);
            }
        }
        else {
            printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id));
            checkCudaErrors(cudaSetDevice(currentDevice));

            exit(EXIT_FAILURE);
@@ -264,18 +288,22 @@ int runTest(const int argc, const char **argv) {
        if (strcmp(modeStr, "quick") == 0) {
            printf(" Quick Mode\n\n");
            mode = QUICK_MODE;
        }
        else if (strcmp(modeStr, "shmoo") == 0) {
            printf(" Shmoo Mode\n\n");
            mode = SHMOO_MODE;
        }
        else if (strcmp(modeStr, "range") == 0) {
            printf(" Range Mode\n\n");
            mode = RANGE_MODE;
        }
        else {
            printf("Invalid mode - valid modes are quick, range, or shmoo\n");
            printf("See --help for more information\n");
            return -3000;
        }
    }
    else {
        // default mode - quick
        printf(" Quick Mode\n\n");
        mode = QUICK_MODE;
@@ -320,7 +348,8 @@ int runTest(const int argc, const char **argv) {
            printf("Illegal argument - start must be greater than zero\n");
            return -4000;
        }
    }
    else {
        printf("Must specify a starting size in range mode\n");
        printf("See --help for more information\n");
        return -5000;
@@ -338,7 +367,8 @@ int runTest(const int argc, const char **argv) {
            printf("Illegal argument - start is greater than end\n");
            return -7000;
        }
    }
    else {
        printf("Must specify an end size in range mode.\n");
        printf("See --help for more information\n");
        return -8000;
@@ -351,7 +381,8 @@ int runTest(const int argc, const char **argv) {
            printf("Illegal argument - increment must be greater than zero\n");
            return -9000;
        }
    }
    else {
        printf("Must specify an increment in user mode\n");
        printf("See --help for more information\n");
        return -10000;
@@ -359,21 +390,42 @@ int runTest(const int argc, const char **argv) {
    }

    if (htod) {
        testBandwidth((unsigned int)start,
                      (unsigned int)end,
                      (unsigned int)increment,
                      mode,
                      HOST_TO_DEVICE,
                      printmode,
                      memMode,
                      startDevice,
                      endDevice,
                      wc);
    }

    if (dtoh) {
        testBandwidth((unsigned int)start,
                      (unsigned int)end,
                      (unsigned int)increment,
                      mode,
                      DEVICE_TO_HOST,
                      printmode,
                      memMode,
                      startDevice,
                      endDevice,
                      wc);
    }

    if (dtod) {
        testBandwidth((unsigned int)start,
                      (unsigned int)end,
                      (unsigned int)increment,
                      mode,
                      DEVICE_TO_DEVICE,
                      printmode,
                      memMode,
                      startDevice,
                      endDevice,
                      wc);
    }

    // Ensure that we reset all CUDA Devices in question
@@ -387,19 +439,24 @@ int runTest(const int argc, const char **argv) {
///////////////////////////////////////////////////////////////////////////////
// Run a bandwidth test
///////////////////////////////////////////////////////////////////////////////
void testBandwidth(unsigned int start,
                   unsigned int end,
                   unsigned int increment,
                   testMode mode,
                   memcpyKind kind,
                   printMode printmode,
                   memoryMode memMode,
                   int startDevice,
                   int endDevice,
                   bool wc)
{
    switch (mode) {
    case QUICK_MODE:
        testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice, endDevice, wc);
        break;

    case RANGE_MODE:
        testBandwidthRange(start, end, increment, kind, printmode, memMode, startDevice, endDevice, wc);
        break;

    case SHMOO_MODE:
@@ -414,20 +471,30 @@ void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
//////////////////////////////////////////////////////////////////////
// Run a quick mode bandwidth test
//////////////////////////////////////////////////////////////////////
void testBandwidthQuick(unsigned int size,
                        memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc)
{
    testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, printmode, memMode, startDevice, endDevice, wc);
}

///////////////////////////////////////////////////////////////////////
// Run a range mode bandwidth test
//////////////////////////////////////////////////////////////////////
void testBandwidthRange(unsigned int start,
                        unsigned int end,
                        unsigned int increment,
                        memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc)
{
    // count the number of copies we're going to run
    unsigned int count = 1 + ((end - start) / increment);

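As a worked instance of the count formula: with the range from the sample's --help text further down (start=1024, end=102400, increment=1024), the test visits

    unsigned int exampleCount = 1 + ((102400u - 1024u) / 1024u); // = 100 transfer sizes
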
@@ -441,8 +508,7 @@ void testBandwidthRange(unsigned int start, unsigned int end,
    }

    // Use the device asked by the user
    for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
        cudaSetDevice(currentDevice);

        // run each of the copies
@@ -467,11 +533,10 @@ void testBandwidthRange(unsigned int start, unsigned int end,

    // print results
    if (printmode == CSV) {
        printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
    }
    else {
        printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
    }

    // clean up
@@ -482,18 +547,21 @@ void testBandwidthRange(unsigned int start, unsigned int end,
//////////////////////////////////////////////////////////////////////////////
// Intense shmoo mode - covers a large range of values with varying increments
//////////////////////////////////////////////////////////////////////////////
void testBandwidthShmoo(memcpyKind kind,
                        printMode printmode,
                        memoryMode memMode,
                        int startDevice,
                        int endDevice,
                        bool wc)
{
    // count the number of copies to make
    unsigned int count = 1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB)
                       + ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB)
                       + ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB)
                       + ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB)
                       + ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB)
                       + ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB)
                       + ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);

    unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
    double *bandwidths = (double *)malloc(count * sizeof(double));
@@ -505,8 +573,7 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
    }

    // Use the device asked by the user
    for (int currentDevice = startDevice; currentDevice <= endDevice; currentDevice++) {
        cudaSetDevice(currentDevice);
        // Run the shmoo
        int iteration = 0;
@@ -515,17 +582,23 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
        while (memSize <= SHMOO_MEMSIZE_MAX) {
            if (memSize < SHMOO_LIMIT_20KB) {
                memSize += SHMOO_INCREMENT_1KB;
            }
            else if (memSize < SHMOO_LIMIT_50KB) {
                memSize += SHMOO_INCREMENT_2KB;
            }
            else if (memSize < SHMOO_LIMIT_100KB) {
                memSize += SHMOO_INCREMENT_10KB;
            }
            else if (memSize < SHMOO_LIMIT_1MB) {
                memSize += SHMOO_INCREMENT_100KB;
            }
            else if (memSize < SHMOO_LIMIT_16MB) {
                memSize += SHMOO_INCREMENT_1MB;
            }
            else if (memSize < SHMOO_LIMIT_32MB) {
                memSize += SHMOO_INCREMENT_2MB;
            }
            else {
                memSize += SHMOO_INCREMENT_4MB;
            }

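A self-contained sketch of the same bucketed walk, handy for checking how many sizes the shmoo visits. The SHMOO_LIMIT_* names above suggest the limits used here, but the numeric values and the 1 KiB starting size are assumptions of the sketch, not the sample's definitions:

    #include <cstdio>

    // Bucketed increment schedule: small sizes step finely, large sizes coarsely.
    static unsigned int shmooVisitCount(unsigned int maxSize)
    {
        unsigned int n = 0;
        unsigned int sz = 1024; // assumed starting size
        while (sz <= maxSize) {
            ++n;
            if (sz < (20u << 10))        sz += 1u << 10;   // 1 KiB steps below 20 KiB
            else if (sz < (50u << 10))   sz += 2u << 10;   // 2 KiB steps below 50 KiB
            else if (sz < (100u << 10))  sz += 10u << 10;  // 10 KiB steps below 100 KiB
            else if (sz < (1u << 20))    sz += 100u << 10; // 100 KiB steps below 1 MiB
            else if (sz < (16u << 20))   sz += 1u << 20;   // 1 MiB steps below 16 MiB
            else if (sz < (32u << 20))   sz += 2u << 20;   // 2 MiB steps below 32 MiB
            else                         sz += 4u << 20;   // 4 MiB steps above
        }
        return n;
    }

    int main(void)
    {
        printf("%u transfer sizes up to 64 MiB\n", shmooVisitCount(64u << 20));
        return 0;
    }
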
@@ -533,18 +606,15 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,

            switch (kind) {
            case DEVICE_TO_HOST:
                bandwidths[iteration] += testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
                break;

            case HOST_TO_DEVICE:
                bandwidths[iteration] += testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
                break;

            case DEVICE_TO_DEVICE:
                bandwidths[iteration] += testDeviceToDeviceTransfer(memSizes[iteration]);
                break;
            }

@@ -558,11 +628,10 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
    printf("\n");

    if (CSV == printmode) {
        printResultsCSV(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
    }
    else {
        printResultsReadable(memSizes, bandwidths, count, kind, memMode, (1 + endDevice - startDevice), wc);
    }

    // clean up
@@ -573,8 +642,8 @@ void testBandwidthShmoo(memcpyKind kind, printMode printmode,
///////////////////////////////////////////////////////////////////////////////
// test the bandwidth of a device to host memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode, bool wc)
{
    StopWatchInterface *timer = NULL;
    float elapsedTimeInMs = 0.0f;
    float bandwidthInGBs = 0.0f;
@@ -590,15 +659,14 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
    if (PINNED == memMode) {
        // pinned memory mode - use special function to get OS-pinned memory
#if CUDART_VERSION >= 2020
        checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
        checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
#else
        checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
        checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
    }
    else {
        // pageable memory mode - use malloc
        h_idata = (unsigned char *)malloc(memSize);
        h_odata = (unsigned char *)malloc(memSize);
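The cudaHostAllocWriteCombined flag used above trades CPU read speed for faster bus transfers: write-combined pages bypass the CPU caches, so the host should fill them sequentially and avoid reading them back in a hot path. A minimal allocation pairing, assuming a CUDA 2.2+ runtime and the sample's checkCudaErrors helper; the buffer name and size are illustrative:

    unsigned char *wcBuf = NULL;
    checkCudaErrors(cudaHostAlloc((void **)&wcBuf, 1 << 20, cudaHostAllocWriteCombined));
    // ... write wcBuf sequentially on the CPU, then copy it to the device ...
    checkCudaErrors(cudaFreeHost(wcBuf));
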
@@ -619,16 +687,15 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
    checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));

    // initialize the device memory
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));

    // copy data from GPU to Host
    if (PINNED == memMode) {
        if (bDontUseGPUTiming)
            sdkStartTimer(&timer);
        checkCudaErrors(cudaEventRecord(start, 0));
        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
            checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost, 0));
        }
        checkCudaErrors(cudaEventRecord(stop, 0));
        checkCudaErrors(cudaDeviceSynchronize());
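The event pair above is the standard CUDA pattern for timing async copies: both records land on stream 0, so the measured interval brackets all MEMCOPY_ITERATIONS transfers. A self-contained sketch of that pattern, with names local to the sketch:

    cudaEvent_t t0, t1;
    checkCudaErrors(cudaEventCreate(&t0));
    checkCudaErrors(cudaEventCreate(&t1));
    checkCudaErrors(cudaEventRecord(t0, 0));            // marker enqueued on stream 0
    // ... enqueue cudaMemcpyAsync calls on stream 0 here ...
    checkCudaErrors(cudaEventRecord(t1, 0));            // marker after the last copy
    checkCudaErrors(cudaEventSynchronize(t1));          // wait for the GPU to reach t1
    float ms = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&ms, t0, t1)); // GPU-side milliseconds
    checkCudaErrors(cudaEventDestroy(t0));
    checkCudaErrors(cudaEventDestroy(t1));
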
@@ -638,12 +705,12 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
            elapsedTimeInMs = sdkGetTimerValue(&timer);
            sdkResetTimer(&timer);
        }
    }
    else {
        elapsedTimeInMs = 0;
        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
            sdkStartTimer(&timer);
            checkCudaErrors(cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost));
            sdkStopTimer(&timer);
            elapsedTimeInMs += sdkGetTimerValue(&timer);
            sdkResetTimer(&timer);
@@ -663,7 +730,8 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
    if (PINNED == memMode) {
        checkCudaErrors(cudaFreeHost(h_idata));
        checkCudaErrors(cudaFreeHost(h_odata));
    }
    else {
        free(h_idata);
        free(h_odata);
    }
@@ -676,8 +744,8 @@ float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a host to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode, bool wc)
{
    StopWatchInterface *timer = NULL;
    float elapsedTimeInMs = 0.0f;
    float bandwidthInGBs = 0.0f;
@@ -692,13 +760,13 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
    if (PINNED == memMode) {
#if CUDART_VERSION >= 2020
        // pinned memory mode - use special function to get OS-pinned memory
        checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize, (wc) ? cudaHostAllocWriteCombined : 0));
#else
        // pinned memory mode - use special function to get OS-pinned memory
        checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
    }
    else {
        // pageable memory mode - use malloc
        h_odata = (unsigned char *)malloc(memSize);

@@ -732,11 +800,11 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,

    // copy host memory to device memory
    if (PINNED == memMode) {
        if (bDontUseGPUTiming)
            sdkStartTimer(&timer);
        checkCudaErrors(cudaEventRecord(start, 0));
        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
            checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize, cudaMemcpyHostToDevice, 0));
        }
        checkCudaErrors(cudaEventRecord(stop, 0));
        checkCudaErrors(cudaDeviceSynchronize());
@@ -746,12 +814,12 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
            elapsedTimeInMs = sdkGetTimerValue(&timer);
            sdkResetTimer(&timer);
        }
    }
    else {
        elapsedTimeInMs = 0;
        for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
            sdkStartTimer(&timer);
            checkCudaErrors(cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice));
            sdkStopTimer(&timer);
            elapsedTimeInMs += sdkGetTimerValue(&timer);
            sdkResetTimer(&timer);
@@ -770,7 +838,8 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,

    if (PINNED == memMode) {
        checkCudaErrors(cudaFreeHost(h_odata));
    }
    else {
        free(h_odata);
    }

@@ -784,7 +853,8 @@ float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
///////////////////////////////////////////////////////////////////////////////
//! test the bandwidth of a device to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
float testDeviceToDeviceTransfer(unsigned int memSize)
{
    StopWatchInterface *timer = NULL;
    float elapsedTimeInMs = 0.0f;
    float bandwidthInGBs = 0.0f;
@@ -814,16 +884,14 @@ float testDeviceToDeviceTransfer(unsigned int memSize) {
    checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));

    // initialize memory
    checkCudaErrors(cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));

    // run the memcopy
    sdkStartTimer(&timer);
    checkCudaErrors(cudaEventRecord(start, 0));

    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
        checkCudaErrors(cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice));
    }

    checkCudaErrors(cudaEventRecord(stop, 0));
@@ -860,9 +928,14 @@ float testDeviceToDeviceTransfer(unsigned int memSize) {
/////////////////////////////////////////////////////////
// print results in an easily read format
////////////////////////////////////////////////////////
void printResultsReadable(unsigned int *memSizes,
                          double *bandwidths,
                          unsigned int count,
                          memcpyKind kind,
                          memoryMode memMode,
                          int iNumDevs,
                          bool wc)
{
    printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
    printf(" %s Memory Transfers\n", sMemoryMode[memMode]);

@@ -874,35 +947,41 @@ void printResultsReadable(unsigned int *memSizes, double *bandwidths,
    unsigned int i;

    for (i = 0; i < (count - 1); i++) {
        printf(" %u\t\t\t%s%.1f\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
    }

    printf(" %u\t\t\t%s%.1f\n\n", memSizes[i], (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
}

///////////////////////////////////////////////////////////////////////////
// print results in a database format
///////////////////////////////////////////////////////////////////////////
void printResultsCSV(unsigned int *memSizes,
                     double *bandwidths,
                     unsigned int count,
                     memcpyKind kind,
                     memoryMode memMode,
                     int iNumDevs,
                     bool wc)
{
    std::string sConfig;

    // log config information
    if (kind == DEVICE_TO_DEVICE) {
        sConfig += "D2D";
    }
    else {
        if (kind == DEVICE_TO_HOST) {
            sConfig += "D2H";
        }
        else if (kind == HOST_TO_DEVICE) {
            sConfig += "H2D";
        }

        if (memMode == PAGEABLE) {
            sConfig += "-Paged";
        }
        else if (memMode == PINNED) {
            sConfig += "-Pinned";

            if (wc) {
@@ -916,27 +995,28 @@ void printResultsCSV(unsigned int *memSizes, double *bandwidths,

    for (i = 0; i < count; i++) {
        dSeconds = (double)memSizes[i] / (bandwidths[i] * (double)(1e9));
        printf("bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u "
               "bytes, NumDevsUsed = %d\n",
               sConfig.c_str(),
               bandwidths[i],
               dSeconds,
               memSizes[i],
               iNumDevs);
    }
}

///////////////////////////////////////////////////////////////////////////
// Print help screen
///////////////////////////////////////////////////////////////////////////
void printHelp(void)
{
    printf("Usage: bandwidthTest [OPTION]...\n");
    printf("Test the bandwidth for device to host, host to device, and device to "
           "device transfers\n");
    printf("\n");
    printf("Example: measure the bandwidth of device to host pinned memory copies "
           "in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n");
    printf("./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 "
           "--increment=1024 --dtoh\n");

    printf("\n");
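The dSeconds line above simply inverts the bandwidth definition used throughout the sample, GB/s = bytes / (1e9 * seconds). As a hypothetical worked example:

    dSeconds = 33554432 / (12.3 * 1e9); // a 32 MiB copy at 12.3 GB/s takes about 0.00273 s
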
@@ -32,7 +32,6 @@

#include <cuda_runtime.h>
#include <helper_cuda.h>

#include <iostream>
#include <memory>
#include <string>
@@ -46,16 +45,13 @@ char **pArgv = NULL;
#include <cuda.h>

// This function wraps the CUDA Driver API into a template function
template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);

    if (CUDA_SUCCESS != error) {
        fprintf(
            stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", error, __FILE__, __LINE__);

        exit(EXIT_FAILURE);
    }
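A typical call through this wrapper, querying the async copy-engine count (the attribute enum comes from cuda.h; dev is assumed to be a valid CUdevice, and the variable name is illustrative):

    int asyncEngineCount = 0;
    getCudaAttribute<int>(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
    printf("Async engines: %d\n", asyncEngineCount);
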
@@ -66,20 +62,19 @@ inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess) {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", static_cast<int>(error_id), cudaGetErrorString(error_id));
        printf("Result = FAIL\n");
        exit(EXIT_FAILURE);
    }
@@ -87,7 +82,8 @@ int main(int argc, char **argv) {
    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0) {
        printf("There are no available device(s) that support CUDA\n");
    }
    else {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }

@@ -104,20 +100,23 @@ int main(int argc, char **argv) {
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
               driverVersion / 1000,
               (driverVersion % 100) / 10,
               runtimeVersion / 1000,
               (runtimeVersion % 100) / 10);
        printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);

        char msg[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        sprintf_s(msg,
                  sizeof(msg),
                  " Total amount of global memory: %.0f MBytes "
                  "(%llu bytes)\n",
                  static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
                  (unsigned long long)deviceProp.totalGlobalMem);
#else
        snprintf(msg,
                 sizeof(msg),
                 " Total amount of global memory: %.0f MBytes "
                 "(%llu bytes)\n",
                 static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
@@ -128,121 +127,100 @@ int main(int argc, char **argv) {
        printf(" (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf(" GPU Max Clock rate: %.0f MHz (%0.2f "
               "GHz)\n",
               deviceProp.clockRate * 1e-3f,
               deviceProp.clockRate * 1e-6f);

#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
        printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);

        if (deviceProp.l2CacheSize) {
            printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
        }

#else
        // This only available in CUDA 4.0-4.2 (but these were only exposed in the
        // CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);

        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf(" Memory Bus Width: %d-bit\n", memBusWidth);

        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

        if (L2CacheSize) {
            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
        }

#endif

        printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, "
               "%d), 3D=(%d, %d, %d)\n",
               deviceProp.maxTexture1D,
               deviceProp.maxTexture2D[0],
               deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0],
               deviceProp.maxTexture3D[1],
               deviceProp.maxTexture3D[2]);
        printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
               deviceProp.maxTexture1DLayered[0],
               deviceProp.maxTexture1DLayered[1]);
        printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
               "layers\n",
               deviceProp.maxTexture2DLayered[0],
               deviceProp.maxTexture2DLayered[1],
               deviceProp.maxTexture2DLayered[2]);

        printf(" Total amount of constant memory: %zu bytes\n", deviceProp.totalConstMem);
        printf(" Total amount of shared memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf(" Total shared memory per multiprocessor: %zu bytes\n", deviceProp.sharedMemPerMultiprocessor);
        printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf(" Warp size: %d\n", deviceProp.warpSize);
        printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf(" Maximum memory pitch: %zu bytes\n", deviceProp.memPitch);
        printf(" Texture alignment: %zu bytes\n", deviceProp.textureAlignment);
        printf(" Concurrent copy and kernel execution: %s with %d copy "
               "engine(s)\n",
               (deviceProp.deviceOverlap ? "Yes" : "No"),
               deviceProp.asyncEngineCount);
        printf(" Run time limit on kernels: %s\n",
               deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
        printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
        printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
               deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
        printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        printf(" Device supports Managed Memory: %s\n", deviceProp.managedMemory ? "Yes" : "No");
        printf(" Device supports Compute Preemption: %s\n",
               deviceProp.computePreemptionSupported ? "Yes" : "No");
        printf(" Supports Cooperative Kernel Launch: %s\n", deviceProp.cooperativeLaunch ? "Yes" : "No");
        printf(" Supports MultiDevice Co-op Kernel Launch: %s\n",
               deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
        printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n",
               deviceProp.pciDomainID,
               deviceProp.pciBusID,
               deviceProp.pciDeviceID);

        const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with device "
                                      "simultaneously)",
                                      "Exclusive (only one host thread in one process is able to use "
                                      "::cudaSetDevice() with this device)",
@@ -250,7 +228,8 @@ int main(int argc, char **argv) {
                                      "device)",
                                      "Exclusive Process (many threads in one process is able to use "
                                      "::cudaSetDevice() with this device)",
                                      "Unknown",
                                      NULL};
        printf(" Compute Mode:\n");
        printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
    }
@@ -286,10 +265,12 @@ int main(int argc, char **argv) {
            if (gpuid[i] == gpuid[j]) {
                continue;
            }
            checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
            printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
                   prop[gpuid[i]].name,
                   gpuid[i],
                   prop[gpuid[j]].name,
                   gpuid[j],
                   can_access_peer ? "Yes" : "No");
        }
    }
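cudaDeviceCanAccessPeer only reports capability; to actually map a peer's allocations a program must opt in from the accessing device. A minimal sketch reusing the gpuid[] pair from the loop above, with error handling via the sample's checkCudaErrors:

    checkCudaErrors(cudaSetDevice(gpuid[i]));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[j], 0)); // flags must be 0
    // gpuid[i] can now cudaMemcpyPeer() to/from gpuid[j] and dereference its
    // device pointers directly from kernels
    checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[j]));
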
@@ -306,22 +287,18 @@ int main(int argc, char **argv) {
    // driver version
    sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
#else
    snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
#endif
    sProfileString += cTemp;

    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
#else
    snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
#endif
    sProfileString += cTemp;

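Both branches decode the packed version integer the same way; as a hypothetical worked example, a value of 12040 from cudaDriverGetVersion yields:

    int v = 12040;              // hypothetical packed version
    int major = v / 1000;       // 12
    int minor = (v % 100) / 10; // 40 / 10 = 4, i.e. the string "12.4"
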
@@ -30,17 +30,17 @@
*/

// includes, system

#include <cuda.h>
#include <helper_cuda_drvapi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    CUdevice dev;
    int major = 0, minor = 0;
    int deviceCount = 0;
@@ -58,15 +58,14 @@ int main(int argc, char **argv) {
     // This function call returns 0 if there are no CUDA capable devices.
     if (deviceCount == 0) {
         printf("There are no available device(s) that support CUDA\n");
-    } else {
+    }
+    else {
         printf("Detected %d CUDA Capable device(s)\n", deviceCount);
     }

     for (dev = 0; dev < deviceCount; ++dev) {
-        checkCudaErrors(cuDeviceGetAttribute(
-            &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
-        checkCudaErrors(cuDeviceGetAttribute(
-            &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
+        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
+        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));

         checkCudaErrors(cuDeviceGetName(deviceName, 256, dev));

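Note: the checkCudaErrors used throughout this hunk comes from helper_cuda_drvapi.h. A minimal stand-in (an illustrative sketch, not the sample's actual macro) could look like this:

    #include <cuda.h>
    #include <stdio.h>
    #include <stdlib.h>

    // Illustrative stand-in for the sample's checkCudaErrors macro.
    #define CHECK_CU(call)                                              \
        do {                                                            \
            CUresult err_ = (call);                                     \
            if (err_ != CUDA_SUCCESS) {                                 \
                const char *name = NULL;                                \
                cuGetErrorName(err_, &name);                            \
                fprintf(stderr, "CUDA error %s at %s:%d\n",             \
                        name ? name : "UNKNOWN", __FILE__, __LINE__);   \
                exit(EXIT_FAILURE);                                     \
            }                                                           \
        } while (0)

    int main(void)
    {
        int deviceCount = 0;
        CHECK_CU(cuInit(0));
        CHECK_CU(cuDeviceGetCount(&deviceCount));
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
        return 0;
    }
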
@@ -75,9 +74,9 @@ int main(int argc, char **argv) {
         int driverVersion = 0;
         checkCudaErrors(cuDriverGetVersion(&driverVersion));
         printf(" CUDA Driver Version: %d.%d\n",
-               driverVersion / 1000, (driverVersion % 100) / 10);
-        printf(" CUDA Capability Major/Minor version number: %d.%d\n", major,
-               minor);
+               driverVersion / 1000,
+               (driverVersion % 100) / 10);
+        printf(" CUDA Capability Major/Minor version number: %d.%d\n", major, minor);

         size_t totalGlobalMem;
         checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev));

@@ -91,231 +90,169 @@ int main(int argc, char **argv) {
         printf("%s", msg);

         int multiProcessorCount;
-        getCudaAttribute<int>(&multiProcessorCount,
-                              CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+        getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);

         printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
-               multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor),
+               multiProcessorCount,
+               _ConvertSMVer2CoresDRV(major, minor),
                _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);

         int clockRate;
         getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
-        printf(
-            " GPU Max Clock rate: %.0f MHz (%0.2f "
+        printf(" GPU Max Clock rate: %.0f MHz (%0.2f "
                "GHz)\n",
-            clockRate * 1e-3f, clockRate * 1e-6f);
+               clockRate * 1e-3f,
+               clockRate * 1e-6f);
         int memoryClock;
-        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
-                              dev);
-        printf(" Memory Clock rate: %.0f Mhz\n",
-               memoryClock * 1e-3f);
+        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
+        printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
         int memBusWidth;
-        getCudaAttribute<int>(&memBusWidth,
-                              CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
-        printf(" Memory Bus Width: %d-bit\n",
-               memBusWidth);
+        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
+        printf(" Memory Bus Width: %d-bit\n", memBusWidth);
         int L2CacheSize;
         getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

         if (L2CacheSize) {
-            printf(" L2 Cache Size: %d bytes\n",
-                   L2CacheSize);
+            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
         }

         int maxTex1D, maxTex2D[2], maxTex3D[3];
-        getCudaAttribute<int>(&maxTex1D,
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
-        getCudaAttribute<int>(&maxTex2D[0],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
-        getCudaAttribute<int>(&maxTex2D[1],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
-        getCudaAttribute<int>(&maxTex3D[0],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
-        getCudaAttribute<int>(&maxTex3D[1],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
-        getCudaAttribute<int>(&maxTex3D[2],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
-        printf(
-            " Max Texture Dimension Sizes 1D=(%d) 2D=(%d, %d) "
+        getCudaAttribute<int>(&maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
+        getCudaAttribute<int>(&maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
+        getCudaAttribute<int>(&maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
+        getCudaAttribute<int>(&maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
+        getCudaAttribute<int>(&maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
+        getCudaAttribute<int>(&maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
+        printf(" Max Texture Dimension Sizes 1D=(%d) 2D=(%d, %d) "
                "3D=(%d, %d, %d)\n",
-            maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1],
+               maxTex1D,
+               maxTex2D[0],
+               maxTex2D[1],
+               maxTex3D[0],
+               maxTex3D[1],
                maxTex3D[2]);

         int maxTex1DLayered[2];
-        getCudaAttribute<int>(&maxTex1DLayered[0],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH,
-                              dev);
-        getCudaAttribute<int>(&maxTex1DLayered[1],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS,
-                              dev);
-        printf(
-            " Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
-            maxTex1DLayered[0], maxTex1DLayered[1]);
+        getCudaAttribute<int>(&maxTex1DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, dev);
+        getCudaAttribute<int>(&maxTex1DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, dev);
+        printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
+               maxTex1DLayered[0],
+               maxTex1DLayered[1]);

         int maxTex2DLayered[3];
-        getCudaAttribute<int>(&maxTex2DLayered[0],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH,
-                              dev);
-        getCudaAttribute<int>(&maxTex2DLayered[1],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT,
-                              dev);
-        getCudaAttribute<int>(&maxTex2DLayered[2],
-                              CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS,
-                              dev);
-        printf(
-            " Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
+        getCudaAttribute<int>(&maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev);
+        getCudaAttribute<int>(&maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev);
+        getCudaAttribute<int>(&maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev);
+        printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
                "layers\n",
-            maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]);
+               maxTex2DLayered[0],
+               maxTex2DLayered[1],
+               maxTex2DLayered[2]);

         int totalConstantMemory;
-        getCudaAttribute<int>(&totalConstantMemory,
-                              CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
-        printf(" Total amount of constant memory: %u bytes\n",
-               totalConstantMemory);
+        getCudaAttribute<int>(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
+        printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory);
         int sharedMemPerBlock;
-        getCudaAttribute<int>(&sharedMemPerBlock,
-                              CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
-        printf(" Total amount of shared memory per block: %u bytes\n",
-               sharedMemPerBlock);
+        getCudaAttribute<int>(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
+        printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock);
         int regsPerBlock;
-        getCudaAttribute<int>(&regsPerBlock,
-                              CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
-        printf(" Total number of registers available per block: %d\n",
-               regsPerBlock);
+        getCudaAttribute<int>(&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
+        printf(" Total number of registers available per block: %d\n", regsPerBlock);
         int warpSize;
         getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
         printf(" Warp size: %d\n", warpSize);
         int maxThreadsPerMultiProcessor;
-        getCudaAttribute<int>(&maxThreadsPerMultiProcessor,
-                              CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
-                              dev);
-        printf(" Maximum number of threads per multiprocessor: %d\n",
-               maxThreadsPerMultiProcessor);
+        getCudaAttribute<int>(&maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
+        printf(" Maximum number of threads per multiprocessor: %d\n", maxThreadsPerMultiProcessor);
         int maxThreadsPerBlock;
-        getCudaAttribute<int>(&maxThreadsPerBlock,
-                              CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
-        printf(" Maximum number of threads per block: %d\n",
-               maxThreadsPerBlock);
+        getCudaAttribute<int>(&maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
+        printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock);

         int blockDim[3];
-        getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
-                              dev);
-        getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
-                              dev);
-        getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
-                              dev);
-        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
-               blockDim[0], blockDim[1], blockDim[2]);
+        getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
+        getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
+        getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
+        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", blockDim[0], blockDim[1], blockDim[2]);
         int gridDim[3];
         getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
         getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
         getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
-        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
-               gridDim[0], gridDim[1], gridDim[2]);
+        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", gridDim[0], gridDim[1], gridDim[2]);

         int textureAlign;
-        getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
-                              dev);
-        printf(" Texture alignment: %u bytes\n",
-               textureAlign);
+        getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev);
+        printf(" Texture alignment: %u bytes\n", textureAlign);

         int memPitch;
         getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
-        printf(" Maximum memory pitch: %u bytes\n",
-               memPitch);
+        printf(" Maximum memory pitch: %u bytes\n", memPitch);

         int gpuOverlap;
         getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);

         int asyncEngineCount;
-        getCudaAttribute<int>(&asyncEngineCount,
-                              CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
-        printf(
-            " Concurrent copy and kernel execution: %s with %d copy "
+        getCudaAttribute<int>(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
+        printf(" Concurrent copy and kernel execution: %s with %d copy "
                "engine(s)\n",
-            (gpuOverlap ? "Yes" : "No"), asyncEngineCount);
+               (gpuOverlap ? "Yes" : "No"),
+               asyncEngineCount);

         int kernelExecTimeoutEnabled;
-        getCudaAttribute<int>(&kernelExecTimeoutEnabled,
-                              CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
-        printf(" Run time limit on kernels: %s\n",
-               kernelExecTimeoutEnabled ? "Yes" : "No");
+        getCudaAttribute<int>(&kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
+        printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");
         int integrated;
         getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
-        printf(" Integrated GPU sharing Host Memory: %s\n",
-               integrated ? "Yes" : "No");
+        printf(" Integrated GPU sharing Host Memory: %s\n", integrated ? "Yes" : "No");
         int canMapHostMemory;
-        getCudaAttribute<int>(&canMapHostMemory,
-                              CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
-        printf(" Support host page-locked memory mapping: %s\n",
-               canMapHostMemory ? "Yes" : "No");
+        getCudaAttribute<int>(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
+        printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No");

         int concurrentKernels;
-        getCudaAttribute<int>(&concurrentKernels,
-                              CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
-        printf(" Concurrent kernel execution: %s\n",
-               concurrentKernels ? "Yes" : "No");
+        getCudaAttribute<int>(&concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
+        printf(" Concurrent kernel execution: %s\n", concurrentKernels ? "Yes" : "No");

         int surfaceAlignment;
-        getCudaAttribute<int>(&surfaceAlignment,
-                              CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
-        printf(" Alignment requirement for Surfaces: %s\n",
-               surfaceAlignment ? "Yes" : "No");
+        getCudaAttribute<int>(&surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
+        printf(" Alignment requirement for Surfaces: %s\n", surfaceAlignment ? "Yes" : "No");

         int eccEnabled;
         getCudaAttribute<int>(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
-        printf(" Device has ECC support: %s\n",
-               eccEnabled ? "Enabled" : "Disabled");
+        printf(" Device has ECC support: %s\n", eccEnabled ? "Enabled" : "Disabled");

 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
         int tccDriver;
         getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);
         printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
-               tccDriver ? "TCC (Tesla Compute Cluster Driver)"
-                         : "WDDM (Windows Display Driver Model)");
+               tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
 #endif

         int unifiedAddressing;
-        getCudaAttribute<int>(&unifiedAddressing,
-                              CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
-        printf(" Device supports Unified Addressing (UVA): %s\n",
-               unifiedAddressing ? "Yes" : "No");
+        getCudaAttribute<int>(&unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
+        printf(" Device supports Unified Addressing (UVA): %s\n", unifiedAddressing ? "Yes" : "No");

         int managedMemory;
-        getCudaAttribute<int>(&managedMemory, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY,
-                              dev);
-        printf(" Device supports Managed Memory: %s\n",
-               managedMemory ? "Yes" : "No");
+        getCudaAttribute<int>(&managedMemory, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, dev);
+        printf(" Device supports Managed Memory: %s\n", managedMemory ? "Yes" : "No");

         int computePreemption;
-        getCudaAttribute<int>(&computePreemption,
-                              CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
-                              dev);
-        printf(" Device supports Compute Preemption: %s\n",
-               computePreemption ? "Yes" : "No");
+        getCudaAttribute<int>(&computePreemption, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, dev);
+        printf(" Device supports Compute Preemption: %s\n", computePreemption ? "Yes" : "No");

         int cooperativeLaunch;
-        getCudaAttribute<int>(&cooperativeLaunch,
-                              CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
-        printf(" Supports Cooperative Kernel Launch: %s\n",
-               cooperativeLaunch ? "Yes" : "No");
+        getCudaAttribute<int>(&cooperativeLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
+        printf(" Supports Cooperative Kernel Launch: %s\n", cooperativeLaunch ? "Yes" : "No");

         int cooperativeMultiDevLaunch;
-        getCudaAttribute<int>(&cooperativeMultiDevLaunch,
-                              CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH,
-                              dev);
-        printf(" Supports MultiDevice Co-op Kernel Launch: %s\n",
-               cooperativeMultiDevLaunch ? "Yes" : "No");
+        getCudaAttribute<int>(&cooperativeMultiDevLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, dev);
+        printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", cooperativeMultiDevLaunch ? "Yes" : "No");

         int pciDomainID, pciBusID, pciDeviceID;
         getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);
         getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
         getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
-        printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n",
-               pciDomainID, pciBusID, pciDeviceID);
+        printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", pciDomainID, pciBusID, pciDeviceID);

-        const char *sComputeMode[] = {
-            "Default (multiple host threads can use ::cudaSetDevice() with device "
+        const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with device "
             "simultaneously)",
             "Exclusive (only one host thread in one process is able to use "
             "::cudaSetDevice() with this device)",

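Note: most of the churn above is the repeated getCudaAttribute<int> helper being collapsed onto single lines. The helper itself ships in helper_cuda_drvapi.h; a rough sketch of the shape of such a wrapper (names and error handling here are illustrative assumptions, not the sample's code):

    #include <cuda.h>
    #include <cstdio>
    #include <cstdlib>

    // Illustrative wrapper over cuDeviceGetAttribute; the sample's real helper
    // lives in helper_cuda_drvapi.h.
    template <class T>
    void getCudaAttributeSketch(T *attribute, CUdevice_attribute attrib, int device)
    {
        int value = 0; // cuDeviceGetAttribute always reports through an int
        CUresult err = cuDeviceGetAttribute(&value, attrib, (CUdevice)device);
        if (err != CUDA_SUCCESS) {
            fprintf(stderr, "cuDeviceGetAttribute(%d) failed: %d\n", (int)attrib, (int)err);
            exit(EXIT_FAILURE);
        }
        *attribute = (T)value;
    }

    int main()
    {
        cuInit(0);
        int warpSize = 0;
        getCudaAttributeSketch<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, 0);
        printf("Warp size: %d\n", warpSize);
        return 0;
    }
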
@@ -323,7 +260,8 @@ int main(int argc, char **argv) {
             "device)",
             "Exclusive Process (many threads in one process is able to use "
             "::cudaSetDevice() with this device)",
-            "Unknown", NULL};
+            "Unknown",
+            NULL};

         int computeMode;
         getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);

@@ -338,10 +276,8 @@ int main(int argc, char **argv) {
     int tccDriver = 0;

     for (int i = 0; i < deviceCount; i++) {
-        checkCudaErrors(cuDeviceGetAttribute(
-            &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i));
-        checkCudaErrors(cuDeviceGetAttribute(
-            &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i));
+        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i));
+        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i));
         getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);

         // Only boards based on Fermi or later can support P2P

@@ -367,14 +303,15 @@ int main(int argc, char **argv) {
                 if (gpuid[i] == gpuid[j]) {
                     continue;
                 }
-                checkCudaErrors(
-                    cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
+                checkCudaErrors(cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
                 checkCudaErrors(cuDeviceGetName(deviceName0, 256, gpuid[i]));
                 checkCudaErrors(cuDeviceGetName(deviceName1, 256, gpuid[j]));
-                printf(
-                    "> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : "
+                printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : "
                        "%s\n",
-                    deviceName0, gpuid[i], deviceName1, gpuid[j],
+                       deviceName0,
+                       gpuid[i],
+                       deviceName1,
+                       gpuid[j],
                        can_access_peer ? "Yes" : "No");
             }
         }

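Note: the pairwise P2P check above reduces to one driver-API query per ordered device pair. A stripped-down sketch of the same enumeration (error handling elided; CUdevice ordinals are plain ints, so the loop indices can be passed directly):

    #include <cuda.h>
    #include <stdio.h>

    int main(void)
    {
        int deviceCount = 0;
        cuInit(0);
        cuDeviceGetCount(&deviceCount);
        for (int i = 0; i < deviceCount; i++) {
            for (int j = 0; j < deviceCount; j++) {
                if (i == j)
                    continue;
                int canAccessPeer = 0;
                // Reports whether device i can directly address device j's memory.
                cuDeviceCanAccessPeer(&canAccessPeer, i, j);
                printf("GPU%d -> GPU%d : %s\n", i, j, canAccessPeer ? "Yes" : "No");
            }
        }
        return 0;
    }
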
@@ -30,4 +30,3 @@ cudaGetDeviceCount, cudaDeviceGetAttribute
 Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.

 ## References (for more details)
-

@@ -37,32 +37,30 @@
 #include <helper_cuda.h>
 #include <helper_functions.h> // helper for shared that are common to CUDA Samples

-int main(int argc, char **argv) {
+int main(int argc, char **argv)
+{
     int deviceCount = 0;
     checkCudaErrors(cudaGetDeviceCount(&deviceCount));

     // Enumerates Device <-> Device links
     for (int device1 = 0; device1 < deviceCount; device1++) {
         for (int device2 = 0; device2 < deviceCount; device2++) {
-            if (device1 == device2) continue;
+            if (device1 == device2)
+                continue;

             int perfRank = 0;
             int atomicSupported = 0;
             int accessSupported = 0;

-            checkCudaErrors(cudaDeviceGetP2PAttribute(
-                &accessSupported, cudaDevP2PAttrAccessSupported, device1, device2));
-            checkCudaErrors(cudaDeviceGetP2PAttribute(
-                &perfRank, cudaDevP2PAttrPerformanceRank, device1, device2));
-            checkCudaErrors(cudaDeviceGetP2PAttribute(
-                &atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1,
-                device2));
+            checkCudaErrors(
+                cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2));
+            checkCudaErrors(cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2));
+            checkCudaErrors(
+                cudaDeviceGetP2PAttribute(&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, device2));

             if (accessSupported) {
-                std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":"
-                          << std::endl;
-                std::cout << "  * Atomic Supported: "
-                          << (atomicSupported ? "yes" : "no") << std::endl;
+                std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" << std::endl;
+                std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
                 std::cout << "  * Perf Rank: " << perfRank << std::endl;
             }
         }

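Note: topologyQuery only reports the link attributes; it never enables peer access. As a hedged follow-on sketch (not part of the sample), this is how a program would typically act on cudaDevP2PAttrAccessSupported before issuing peer copies:

    #include <cuda_runtime.h>
    #include <cstdio>

    int main()
    {
        int deviceCount = 0;
        cudaGetDeviceCount(&deviceCount);
        for (int d1 = 0; d1 < deviceCount; d1++) {
            for (int d2 = 0; d2 < deviceCount; d2++) {
                if (d1 == d2)
                    continue;
                int accessSupported = 0;
                cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, d1, d2);
                if (accessSupported) {
                    cudaSetDevice(d1);
                    // Grants d1 direct access to d2's memory; flags must be 0.
                    cudaError_t err = cudaDeviceEnablePeerAccess(d2, 0);
                    printf("GPU%d -> GPU%d peer access: %s\n", d1, d2,
                           err == cudaSuccess ? "enabled" : cudaGetErrorString(err));
                }
            }
        }
        return 0;
    }
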
@@ -71,11 +69,9 @@ int main(int argc, char **argv) {
     // Enumerates Device <-> Host links
     for (int device = 0; device < deviceCount; device++) {
         int atomicSupported = 0;
-        checkCudaErrors(cudaDeviceGetAttribute(
-            &atomicSupported, cudaDevAttrHostNativeAtomicSupported, device));
+        checkCudaErrors(cudaDeviceGetAttribute(&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device));
         std::cout << "GPU" << device << " <-> CPU:" << std::endl;
-        std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no")
-                  << std::endl;
+        std::cout << "  * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
     }

     return 0;

@@ -37,4 +37,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
 Make sure the dependencies mentioned in [Dependencies]() section above are installed.

 ## References (for more details)
-

@@ -29,12 +29,14 @@
 // DESCRIPTION: Simple CUDA consumer rendering sample app
 //

-#include <cuda_runtime.h>
 #include "cuda_consumer.h"
-#include "eglstrm_common.h"
+
+#include <cuda_runtime.h>
 #include <math.h>
 #include <unistd.h>

+#include "eglstrm_common.h"
+
 #if defined(EXTENSION_LIST)
 EXTENSION_LIST(EXTLST_EXTERN)
 #endif

@@ -47,19 +49,22 @@ static int count_rel = 0;
 static double rel_time[25000] = {0}, total_time_rel = 0;

 void acquireApiStat(void);
-void acquireApiStat(void) {
+void acquireApiStat(void)
+{
     int i = 0;
     double min = 10000000, max = 0;
     double average_launch_time = 0, standard_deviation = 0;
-    if (count_acq == 0) return;
+    if (count_acq == 0)
+        return;
     // lets compute the standard deviation
     min = max = acquire_time[1];
     average_launch_time = (total_time_acq - acquire_time[0]) / count_acq;
     for (i = 1; i < count_acq; i++) {
-        standard_deviation += (acquire_time[i] - average_launch_time) *
-                              (acquire_time[i] - average_launch_time);
-        if (acquire_time[i] < min) min = acquire_time[i];
-        if (acquire_time[i] > max) max = acquire_time[i];
+        standard_deviation += (acquire_time[i] - average_launch_time) * (acquire_time[i] - average_launch_time);
+        if (acquire_time[i] < min)
+            min = acquire_time[i];
+        if (acquire_time[i] > max)
+            max = acquire_time[i];
     }
     standard_deviation = sqrt(standard_deviation / count_acq);
     printf("acquire Avg: %lf\n", average_launch_time);

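Note: the statistics above are a plain mean and population standard deviation over the recorded call times. As a self-contained illustration of the same computation (the sample values are made up):

    #include <math.h>
    #include <stdio.h>

    // Population standard deviation of `count` samples around `mean`.
    static double stddev_of(const double *samples, int count, double mean)
    {
        double acc = 0;
        for (int i = 0; i < count; i++) {
            acc += (samples[i] - mean) * (samples[i] - mean);
        }
        return sqrt(acc / count);
    }

    int main(void)
    {
        double t[] = {1.0, 2.0, 3.0, 4.0};
        double mean = (1.0 + 2.0 + 3.0 + 4.0) / 4.0; // 2.5
        printf("stddev = %f\n", stddev_of(t, 4, mean)); // ~1.118034
        return 0;
    }
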
@@ -70,10 +75,11 @@ void acquireApiStat(void) {
     min = max = rel_time[1];
     average_launch_time = (total_time_rel - rel_time[0]) / count_rel;
     for (i = 1; i < count_rel; i++) {
-        standard_deviation += (rel_time[i] - average_launch_time) *
-                              (rel_time[i] - average_launch_time);
-        if (rel_time[i] < min) min = rel_time[i];
-        if (rel_time[i] > max) max = rel_time[i];
+        standard_deviation += (rel_time[i] - average_launch_time) * (rel_time[i] - average_launch_time);
+        if (rel_time[i] < min)
+            min = rel_time[i];
+        if (rel_time[i] > max)
+            max = rel_time[i];
     }
     standard_deviation = sqrt(standard_deviation / count_rel);
     printf("release Avg: %lf\n", average_launch_time);

@@ -81,8 +87,8 @@ void acquireApiStat(void) {
     printf("release min: %lf\n", min);
     printf("release max: %lf\n", max);
 }
-CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer,
-                                  int frameNumber) {
+CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer, int frameNumber)
+{
     CUresult cuStatus = CUDA_SUCCESS;
     CUeglFrame cudaEgl;
     struct timespec start, end;

@@ -95,8 +101,7 @@ CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer,
     }

     while (1) {
-        if (!eglQueryStreamKHR(cudaConsumer->eglDisplay, cudaConsumer->eglStream,
-                               EGL_STREAM_STATE_KHR, &streamState)) {
+        if (!eglQueryStreamKHR(cudaConsumer->eglDisplay, cudaConsumer->eglStream, EGL_STREAM_STATE_KHR, &streamState)) {
             printf("Cuda Consumer: eglQueryStreamKHR EGL_STREAM_STATE_KHR failed\n");
             cuStatus = CUDA_ERROR_UNKNOWN;
             goto done;

@@ -115,33 +120,35 @@ CUresult cudaConsumerAcquireFrame(test_cuda_consumer_s *cudaConsumer,
             getTime(&start);
         }
         cuStatus =
-            cuEGLStreamConsumerAcquireFrame(&(cudaConsumer->cudaConn), &cudaResource,
-                                            &cudaConsumer->consCudaStream, 16000);
+            cuEGLStreamConsumerAcquireFrame(&(cudaConsumer->cudaConn), &cudaResource, &cudaConsumer->consCudaStream, 16000);
         if (cudaConsumer->profileAPI) {
             getTime(&end);
             curTime = TIME_DIFF(end, start);
             acquire_time[count_acq++] = curTime;
-            if (count_acq == 25000) count_acq = 0;
+            if (count_acq == 25000)
+                count_acq = 0;
             total_time_acq += curTime;
         }
         if (cuStatus == CUDA_SUCCESS) {
             CUdeviceptr pDevPtr = 0;
             cudaError_t err;

-            cuStatus =
-                cuGraphicsResourceGetMappedEglFrame(&cudaEgl, cudaResource, 0, 0);
+            cuStatus = cuGraphicsResourceGetMappedEglFrame(&cudaEgl, cudaResource, 0, 0);
             if (cuStatus != CUDA_SUCCESS) {
                 printf("Cuda get resource failed with %d\n", cuStatus);
                 goto done;
             }
             pDevPtr = (CUdeviceptr)cudaEgl.frame.pPitch[0];

-            err = cudaConsumer_filter(cudaConsumer->consCudaStream, (char *)pDevPtr,
-                                      WIDTH * 4, HEIGHT, PROD_DATA + frameNumber,
-                                      CONS_DATA + frameNumber, frameNumber);
+            err = cudaConsumer_filter(cudaConsumer->consCudaStream,
+                                      (char *)pDevPtr,
+                                      WIDTH * 4,
+                                      HEIGHT,
+                                      PROD_DATA + frameNumber,
+                                      CONS_DATA + frameNumber,
+                                      frameNumber);
             if (err != cudaSuccess) {
-                printf("Cuda Consumer: kernel failed with: %s\n",
-                       cudaGetErrorString(err));
+                printf("Cuda Consumer: kernel failed with: %s\n", cudaGetErrorString(err));
                 goto done;
             }
         }

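Note: condensed, the consumer path in this hunk is acquire a frame from the EGL stream connection, map it to a CUeglFrame, run the filter kernel, and (in the next function) release it. A skeletal sketch of that sequence, assuming an already-connected CUeglStreamConnection and omitting the kernel and profiling (cudaEGL.h provides the interop entry points):

    #include <cudaEGL.h>
    #include <stdio.h>

    CUresult processOneFrame(CUeglStreamConnection *conn, CUstream stream)
    {
        CUgraphicsResource resource = NULL;
        CUeglFrame frame;

        // Wait for the producer, passing the sample's 16000 timeout value.
        CUresult status = cuEGLStreamConsumerAcquireFrame(conn, &resource, &stream, 16000);
        if (status != CUDA_SUCCESS)
            return status;

        status = cuGraphicsResourceGetMappedEglFrame(&frame, resource, 0, 0);
        if (status == CUDA_SUCCESS) {
            // frame.frame.pPitch[0] is the device pointer a consumer kernel reads.
            printf("acquired frame at %p\n", frame.frame.pPitch[0]);
        }
        // Hand the frame back to the producer side of the stream.
        return cuEGLStreamConsumerReleaseFrame(conn, resource, &stream);
    }
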
@@ -150,8 +157,8 @@ done:
     return cuStatus;
 }

-CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *cudaConsumer,
-                                  int frameNumber) {
+CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *cudaConsumer, int frameNumber)
+{
     CUresult cuStatus = CUDA_SUCCESS;
     struct timespec start, end;
     double curTime;

@@ -163,13 +170,13 @@ CUresult cudaConsumerReleaseFrame(test_cuda_consumer_s *cudaConsumer,
     if (cudaConsumer->profileAPI) {
         getTime(&start);
     }
-    cuStatus = cuEGLStreamConsumerReleaseFrame(
-        &cudaConsumer->cudaConn, cudaResource, &cudaConsumer->consCudaStream);
+    cuStatus = cuEGLStreamConsumerReleaseFrame(&cudaConsumer->cudaConn, cudaResource, &cudaConsumer->consCudaStream);
     if (cudaConsumer->profileAPI) {
         getTime(&end);
         curTime = TIME_DIFF(end, start);
         rel_time[count_rel++] = curTime;
-        if (count_rel == 25000) count_rel = 0;
+        if (count_rel == 25000)
+            count_rel = 0;
         total_time_rel += curTime;
     }
     if (cuStatus != CUDA_SUCCESS) {

@@ -181,7 +188,8 @@ done:
     return cuStatus;
 }

-CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer) {
+CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer)
+{
     CUdevice device;
     CUresult status = CUDA_SUCCESS;

@@ -190,34 +198,31 @@ CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer) {
         return status;
     }

-    if (CUDA_SUCCESS !=
-        (status = cuDeviceGet(&device, cudaConsumer->cudaDevId))) {
+    if (CUDA_SUCCESS != (status = cuDeviceGet(&device, cudaConsumer->cudaDevId))) {
         printf("failed to get CUDA device\n");
         return status;
     }

-    if (CUDA_SUCCESS !=
-        (status = cuCtxCreate(&cudaConsumer->context, 0, device))) {
+    if (CUDA_SUCCESS != (status = cuCtxCreate(&cudaConsumer->context, 0, device))) {
         printf("failed to create CUDA context\n");
         return status;
     }

     int major = 0, minor = 0;
     char deviceName[256];
-    cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-                         device);
-    cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-                         device);
+    cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
+    cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
     cuDeviceGetName(deviceName, 256, device);
-    printf(
-        "CUDA Consumer on GPU Device %d: \"%s\" with compute capability "
+    printf("CUDA Consumer on GPU Device %d: \"%s\" with compute capability "
            "%d.%d\n\n",
-        device, deviceName, major, minor);
+           device,
+           deviceName,
+           major,
+           minor);

     cuCtxPopCurrent(&cudaConsumer->context);
     if (major < 6) {
-        printf(
-            "EGLStream_CUDA_CrossGPU requires SM 6.0 or higher arch GPU. "
+        printf("EGLStream_CUDA_CrossGPU requires SM 6.0 or higher arch GPU. "
                "Exiting...\n");
         exit(2); // EXIT_WAIVED
     }

@@ -225,8 +230,8 @@ CUresult cudaDeviceCreateConsumer(test_cuda_consumer_s *cudaConsumer) {
     return status;
 }

-CUresult cuda_consumer_init(test_cuda_consumer_s *cudaConsumer,
-                            TestArgs *args) {
+CUresult cuda_consumer_init(test_cuda_consumer_s *cudaConsumer, TestArgs *args)
+{
     CUresult status = CUDA_SUCCESS;
     int bufferSize;

@@ -250,7 +255,8 @@ done:
     return status;
 }

-CUresult cuda_consumer_Deinit(test_cuda_consumer_s *cudaConsumer) {
+CUresult cuda_consumer_Deinit(test_cuda_consumer_s *cudaConsumer)
+{
     if (cudaConsumer->pCudaCopyMem) {
         free(cudaConsumer->pCudaCopyMem);
     }