Mirror of https://github.com/NVIDIA/cuda-samples.git (synced 2025-07-09 10:30:31 +08:00)
Apply consistent code formatting across the repo. Add clang-format and pre-commit hooks.
This commit is contained in:
parent 2cd58fbc9a
commit ceab6e8bcc
.clang-format (new file, 49 lines added)
@@ -0,0 +1,49 @@
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: Consecutive
AlignConsecutiveDeclarations: Consecutive
AlignConsecutiveMacros: Consecutive
AlignEscapedNewlines: Left
AlignOperands: AlignAfterOperator
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
    AfterClass: true
    AfterControlStatement: false
    AfterExternBlock: true
    AfterFunction: true
    AfterStruct: true
    AfterUnion: true
    BeforeCatch: true
    BeforeElse: true
    IndentBraces: false
BreakBeforeBraces: Custom
BreakBeforeConceptDeclarations: true
BreakBeforeBinaryOperators: NonAssignment
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
ColumnLimit: 120
DerivePointerAlignment: false
FixNamespaceComments: true
IncludeCategories:
    - Regex: '^<.*>'
      Priority: 1
    - Regex: '^".*"'
      Priority: 2
SortIncludes: true
IncludeBlocks: Regroup
IndentWidth: 4
MaxEmptyLinesToKeep: 2
PointerAlignment: Right
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
Standard: c++17
TabWidth: 4
UseTab: Never
...
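For reference, a minimal hypothetical CUDA snippet (not part of this commit, names made up) written the way these settings lay code out: 4-space indentation, a 120-column limit, the opening brace of a function on its own line (BraceWrapping.AfterFunction), braces kept on the same line after control statements, right-aligned pointers, and consecutive assignments aligned on '='.

#include <cstdio>

#include <cuda_runtime.h>

// Hypothetical kernel used only to illustrate the configured style; it is not part of the cuda-samples sources.
__global__ void scaleArray(float *data, float factor, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < n) {
        data[idx] *= factor;
    }
}

int main()
{
    const int n      = 1024;    // AlignConsecutiveAssignments lines up the '=' in this block
    float    *d_data = nullptr; // PointerAlignment: Right binds the '*' to the variable name

    cudaMalloc(&d_data, n * sizeof(float));
    scaleArray<<<(n + 255) / 256, 256>>>(d_data, 2.0f, n);
    cudaDeviceSynchronize();
    cudaFree(d_data);

    printf("done\n");
    return 0;
}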
.pre-commit-config.yaml (new file, 100 lines added)
@@ -0,0 +1,100 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
ci:
  autofix_commit_msg: |
    [pre-commit.ci] auto code formatting
  autofix_prs: false
  autoupdate_branch: ''
  autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
  autoupdate_schedule: quarterly
  skip: []
  submodules: false

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: end-of-file-fixer
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
      - id: mixed-line-ending
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
      - id: trailing-whitespace
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.6
    hooks:
      - id: clang-format
        types_or: [file]
        files: |
          (?x)^(
            ^.*\.c$|
            ^.*\.cpp$|
            ^.*\.cu$|
            ^.*\.cuh$|
            ^.*\.cxx$|
            ^.*\.h$|
            ^.*\.hpp$|
            ^.*\.inl$|
            ^.*\.mm$
          )
        exclude: |
          (?x)^(
            Common/.*
          )
        args: ["-fallback-style=none", "-style=file", "-i"]
@@ -31,10 +31,10 @@
 */

 // system includes
+#include <algorithm>
 #include <cstdio>
 #include <ctime>
 #include <vector>
-#include <algorithm>
 #ifdef USE_PTHREADS
 #include <pthread.h>
 #else
@@ -51,291 +51,287 @@
|
||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||
// SRAND48 and DRAND48 don't exist on windows, but these are the equivalent
|
||||
// functions
|
||||
void srand48(long seed) { srand((unsigned int)seed); }
|
||||
void srand48(long seed) { srand((unsigned int)seed); }
|
||||
double drand48() { return double(rand()) / RAND_MAX; }
|
||||
#endif
|
||||
|
||||
const char *sSDKname = "UnifiedMemoryStreams";
|
||||
|
||||
// simple task
|
||||
template <typename T>
|
||||
struct Task {
|
||||
unsigned int size, id;
|
||||
T *data;
|
||||
T *result;
|
||||
T *vector;
|
||||
template <typename T> struct Task
|
||||
{
|
||||
unsigned int size, id;
|
||||
T *data;
|
||||
T *result;
|
||||
T *vector;
|
||||
|
||||
Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL){};
|
||||
Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL) {
|
||||
// allocate unified memory -- the operation performed in this example will
|
||||
// be a DGEMV
|
||||
checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
|
||||
checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
|
||||
checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
|
||||
checkCudaErrors(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
~Task() {
|
||||
// ensure all memory is deallocated
|
||||
checkCudaErrors(cudaDeviceSynchronize());
|
||||
checkCudaErrors(cudaFree(data));
|
||||
checkCudaErrors(cudaFree(result));
|
||||
checkCudaErrors(cudaFree(vector));
|
||||
}
|
||||
|
||||
void allocate(const unsigned int s, const unsigned int unique_id) {
|
||||
// allocate unified memory outside of constructor
|
||||
id = unique_id;
|
||||
size = s;
|
||||
checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
|
||||
checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
|
||||
checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
|
||||
checkCudaErrors(cudaDeviceSynchronize());
|
||||
|
||||
// populate data with random elements
|
||||
for (unsigned int i = 0; i < size * size; i++) {
|
||||
data[i] = drand48();
|
||||
Task()
|
||||
: size(0)
|
||||
, id(0)
|
||||
, data(NULL)
|
||||
, result(NULL)
|
||||
, vector(NULL) {};
|
||||
Task(unsigned int s)
|
||||
: size(s)
|
||||
, id(0)
|
||||
, data(NULL)
|
||||
, result(NULL)
|
||||
{
|
||||
// allocate unified memory -- the operation performed in this example will
|
||||
// be a DGEMV
|
||||
checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
|
||||
checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
|
||||
checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
|
||||
checkCudaErrors(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < size; i++) {
|
||||
result[i] = 0.;
|
||||
vector[i] = drand48();
|
||||
~Task()
|
||||
{
|
||||
// ensure all memory is deallocated
|
||||
checkCudaErrors(cudaDeviceSynchronize());
|
||||
checkCudaErrors(cudaFree(data));
|
||||
checkCudaErrors(cudaFree(result));
|
||||
checkCudaErrors(cudaFree(vector));
|
||||
}
|
||||
|
||||
void allocate(const unsigned int s, const unsigned int unique_id)
|
||||
{
|
||||
// allocate unified memory outside of constructor
|
||||
id = unique_id;
|
||||
size = s;
|
||||
checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
|
||||
checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
|
||||
checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
|
||||
checkCudaErrors(cudaDeviceSynchronize());
|
||||
|
||||
// populate data with random elements
|
||||
for (unsigned int i = 0; i < size * size; i++) {
|
||||
data[i] = drand48();
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < size; i++) {
|
||||
result[i] = 0.;
|
||||
vector[i] = drand48();
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef USE_PTHREADS
|
||||
struct threadData_t {
|
||||
int tid;
|
||||
Task<double> *TaskListPtr;
|
||||
cudaStream_t *streams;
|
||||
cublasHandle_t *handles;
|
||||
int taskSize;
|
||||
struct threadData_t
|
||||
{
|
||||
int tid;
|
||||
Task<double> *TaskListPtr;
|
||||
cudaStream_t *streams;
|
||||
cublasHandle_t *handles;
|
||||
int taskSize;
|
||||
};
|
||||
|
||||
typedef struct threadData_t threadData;
|
||||
#endif
|
||||
|
||||
// simple host dgemv: assume data is in row-major format and square
|
||||
template <typename T>
|
||||
void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) {
|
||||
// rows
|
||||
for (int i = 0; i < n; i++) {
|
||||
result[i] *= beta;
|
||||
template <typename T> void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result)
|
||||
{
|
||||
// rows
|
||||
for (int i = 0; i < n; i++) {
|
||||
result[i] *= beta;
|
||||
|
||||
for (int j = 0; j < n; j++) {
|
||||
result[i] += A[i * n + j] * x[j];
|
||||
for (int j = 0; j < n; j++) {
|
||||
result[i] += A[i * n + j] * x[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// execute a single task on either host or device depending on size
|
||||
#ifdef USE_PTHREADS
|
||||
void *execute(void *inpArgs) {
|
||||
threadData *dataPtr = (threadData *)inpArgs;
|
||||
cudaStream_t *stream = dataPtr->streams;
|
||||
cublasHandle_t *handle = dataPtr->handles;
|
||||
int tid = dataPtr->tid;
|
||||
void *execute(void *inpArgs)
|
||||
{
|
||||
threadData *dataPtr = (threadData *)inpArgs;
|
||||
cudaStream_t *stream = dataPtr->streams;
|
||||
cublasHandle_t *handle = dataPtr->handles;
|
||||
int tid = dataPtr->tid;
|
||||
|
||||
for (int i = 0; i < dataPtr->taskSize; i++) {
|
||||
Task<double> &t = dataPtr->TaskListPtr[i];
|
||||
for (int i = 0; i < dataPtr->taskSize; i++) {
|
||||
Task<double> &t = dataPtr->TaskListPtr[i];
|
||||
|
||||
if (t.size < 100) {
|
||||
// perform on host
|
||||
printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
|
||||
t.size);
|
||||
if (t.size < 100) {
|
||||
// perform on host
|
||||
printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
|
||||
|
||||
// attach managed memory to a (dummy) stream to allow host access while
|
||||
// the device is running
|
||||
checkCudaErrors(
|
||||
cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
|
||||
checkCudaErrors(
|
||||
cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
|
||||
checkCudaErrors(
|
||||
cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
|
||||
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished
|
||||
checkCudaErrors(cudaStreamSynchronize(stream[0]));
|
||||
// call the host operation
|
||||
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
|
||||
} else {
|
||||
// perform on device
|
||||
printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
|
||||
t.size);
|
||||
double one = 1.0;
|
||||
double zero = 0.0;
|
||||
// attach managed memory to a (dummy) stream to allow host access while
|
||||
// the device is running
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
|
||||
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished
|
||||
checkCudaErrors(cudaStreamSynchronize(stream[0]));
|
||||
// call the host operation
|
||||
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
|
||||
}
|
||||
else {
|
||||
// perform on device
|
||||
printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
|
||||
double one = 1.0;
|
||||
double zero = 0.0;
|
||||
|
||||
// attach managed memory to my stream
|
||||
checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
|
||||
cudaMemAttachSingle));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
|
||||
cudaMemAttachSingle));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
|
||||
cudaMemAttachSingle));
|
||||
// call the device operation
|
||||
checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
|
||||
&one, t.data, t.size, t.vector, 1, &zero,
|
||||
t.result, 1));
|
||||
// attach managed memory to my stream
|
||||
checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
|
||||
// call the device operation
|
||||
checkCudaErrors(cublasDgemv(
|
||||
handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pthread_exit(NULL);
|
||||
pthread_exit(NULL);
|
||||
}
|
||||
#else
|
||||
template <typename T>
|
||||
void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream,
|
||||
int tid) {
|
||||
if (t.size < 100) {
|
||||
// perform on host
|
||||
printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
|
||||
t.size);
|
||||
template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
|
||||
{
|
||||
if (t.size < 100) {
|
||||
// perform on host
|
||||
printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
|
||||
|
||||
// attach managed memory to a (dummy) stream to allow host access while the
|
||||
// device is running
|
||||
checkCudaErrors(
|
||||
cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
|
||||
checkCudaErrors(
|
||||
cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
|
||||
checkCudaErrors(
|
||||
cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
|
||||
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished
|
||||
checkCudaErrors(cudaStreamSynchronize(stream[0]));
|
||||
// call the host operation
|
||||
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
|
||||
} else {
|
||||
// perform on device
|
||||
printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
|
||||
t.size);
|
||||
double one = 1.0;
|
||||
double zero = 0.0;
|
||||
// attach managed memory to a (dummy) stream to allow host access while the
|
||||
// device is running
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
|
||||
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished
|
||||
checkCudaErrors(cudaStreamSynchronize(stream[0]));
|
||||
// call the host operation
|
||||
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
|
||||
}
|
||||
else {
|
||||
// perform on device
|
||||
printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
|
||||
double one = 1.0;
|
||||
double zero = 0.0;
|
||||
|
||||
// attach managed memory to my stream
|
||||
checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
|
||||
cudaMemAttachSingle));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
|
||||
cudaMemAttachSingle));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
|
||||
cudaMemAttachSingle));
|
||||
// call the device operation
|
||||
checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
|
||||
&one, t.data, t.size, t.vector, 1, &zero,
|
||||
t.result, 1));
|
||||
}
|
||||
// attach managed memory to my stream
|
||||
checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
|
||||
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
|
||||
// call the device operation
|
||||
checkCudaErrors(cublasDgemv(
|
||||
handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// populate a list of tasks with random sizes
|
||||
template <typename T>
|
||||
void initialise_tasks(std::vector<Task<T> > &TaskList) {
|
||||
for (unsigned int i = 0; i < TaskList.size(); i++) {
|
||||
// generate random size
|
||||
int size;
|
||||
size = std::max((int)(drand48() * 1000.0), 64);
|
||||
TaskList[i].allocate(size, i);
|
||||
}
|
||||
template <typename T> void initialise_tasks(std::vector<Task<T>> &TaskList)
|
||||
{
|
||||
for (unsigned int i = 0; i < TaskList.size(); i++) {
|
||||
// generate random size
|
||||
int size;
|
||||
size = std::max((int)(drand48() * 1000.0), 64);
|
||||
TaskList[i].allocate(size, i);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
// set device
|
||||
cudaDeviceProp device_prop;
|
||||
int dev_id = findCudaDevice(argc, (const char **)argv);
|
||||
checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
// set device
|
||||
cudaDeviceProp device_prop;
|
||||
int dev_id = findCudaDevice(argc, (const char **)argv);
|
||||
checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
|
||||
|
||||
if (!device_prop.managedMemory) {
|
||||
// This samples requires being run on a device that supports Unified Memory
|
||||
fprintf(stderr, "Unified Memory not supported on this device\n");
|
||||
if (!device_prop.managedMemory) {
|
||||
// This samples requires being run on a device that supports Unified Memory
|
||||
fprintf(stderr, "Unified Memory not supported on this device\n");
|
||||
|
||||
exit(EXIT_WAIVED);
|
||||
}
|
||||
exit(EXIT_WAIVED);
|
||||
}
|
||||
|
||||
if (device_prop.computeMode == cudaComputeModeProhibited) {
|
||||
// This sample requires being run with a default or process exclusive mode
|
||||
fprintf(stderr,
|
||||
"This sample requires a device in either default or process "
|
||||
"exclusive mode\n");
|
||||
if (device_prop.computeMode == cudaComputeModeProhibited) {
|
||||
// This sample requires being run with a default or process exclusive mode
|
||||
fprintf(stderr,
|
||||
"This sample requires a device in either default or process "
|
||||
"exclusive mode\n");
|
||||
|
||||
exit(EXIT_WAIVED);
|
||||
}
|
||||
exit(EXIT_WAIVED);
|
||||
}
|
||||
|
||||
// randomise task sizes
|
||||
int seed = (int)time(NULL);
|
||||
srand48(seed);
|
||||
// randomise task sizes
|
||||
int seed = (int)time(NULL);
|
||||
srand48(seed);
|
||||
|
||||
// set number of threads
|
||||
const int nthreads = 4;
|
||||
// set number of threads
|
||||
const int nthreads = 4;
|
||||
|
||||
// number of streams = number of threads
|
||||
cudaStream_t *streams = new cudaStream_t[nthreads + 1];
|
||||
cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];
|
||||
// number of streams = number of threads
|
||||
cudaStream_t *streams = new cudaStream_t[nthreads + 1];
|
||||
cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];
|
||||
|
||||
for (int i = 0; i < nthreads + 1; i++) {
|
||||
checkCudaErrors(cudaStreamCreate(&streams[i]));
|
||||
checkCudaErrors(cublasCreate(&handles[i]));
|
||||
}
|
||||
for (int i = 0; i < nthreads + 1; i++) {
|
||||
checkCudaErrors(cudaStreamCreate(&streams[i]));
|
||||
checkCudaErrors(cublasCreate(&handles[i]));
|
||||
}
|
||||
|
||||
// create list of N tasks
|
||||
unsigned int N = 40;
|
||||
std::vector<Task<double> > TaskList(N);
|
||||
initialise_tasks(TaskList);
|
||||
// create list of N tasks
|
||||
unsigned int N = 40;
|
||||
std::vector<Task<double>> TaskList(N);
|
||||
initialise_tasks(TaskList);
|
||||
|
||||
printf("Executing tasks on host / device\n");
|
||||
printf("Executing tasks on host / device\n");
|
||||
|
||||
// run through all tasks using threads and streams
|
||||
#ifdef USE_PTHREADS
|
||||
pthread_t threads[nthreads];
|
||||
threadData *InputToThreads = new threadData[nthreads];
|
||||
pthread_t threads[nthreads];
|
||||
threadData *InputToThreads = new threadData[nthreads];
|
||||
|
||||
for (int i = 0; i < nthreads; i++) {
|
||||
checkCudaErrors(cudaSetDevice(dev_id));
|
||||
InputToThreads[i].tid = i;
|
||||
InputToThreads[i].streams = streams;
|
||||
InputToThreads[i].handles = handles;
|
||||
for (int i = 0; i < nthreads; i++) {
|
||||
checkCudaErrors(cudaSetDevice(dev_id));
|
||||
InputToThreads[i].tid = i;
|
||||
InputToThreads[i].streams = streams;
|
||||
InputToThreads[i].handles = handles;
|
||||
|
||||
if ((TaskList.size() / nthreads) == 0) {
|
||||
InputToThreads[i].taskSize = (TaskList.size() / nthreads);
|
||||
InputToThreads[i].TaskListPtr =
|
||||
&TaskList[i * (TaskList.size() / nthreads)];
|
||||
} else {
|
||||
if (i == nthreads - 1) {
|
||||
InputToThreads[i].taskSize =
|
||||
(TaskList.size() / nthreads) + (TaskList.size() % nthreads);
|
||||
InputToThreads[i].TaskListPtr =
|
||||
&TaskList[i * (TaskList.size() / nthreads) +
|
||||
(TaskList.size() % nthreads)];
|
||||
} else {
|
||||
InputToThreads[i].taskSize = (TaskList.size() / nthreads);
|
||||
InputToThreads[i].TaskListPtr =
|
||||
&TaskList[i * (TaskList.size() / nthreads)];
|
||||
}
|
||||
if ((TaskList.size() / nthreads) == 0) {
|
||||
InputToThreads[i].taskSize = (TaskList.size() / nthreads);
|
||||
InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
|
||||
}
|
||||
else {
|
||||
if (i == nthreads - 1) {
|
||||
InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
|
||||
InputToThreads[i].TaskListPtr =
|
||||
&TaskList[i * (TaskList.size() / nthreads) + (TaskList.size() % nthreads)];
|
||||
}
|
||||
else {
|
||||
InputToThreads[i].taskSize = (TaskList.size() / nthreads);
|
||||
InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
|
||||
}
|
||||
}
|
||||
|
||||
pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
|
||||
}
|
||||
for (int i = 0; i < nthreads; i++) {
|
||||
pthread_join(threads[i], NULL);
|
||||
}
|
||||
|
||||
pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
|
||||
}
|
||||
for (int i = 0; i < nthreads; i++) {
|
||||
pthread_join(threads[i], NULL);
|
||||
}
|
||||
#else
|
||||
omp_set_num_threads(nthreads);
|
||||
omp_set_num_threads(nthreads);
|
||||
#pragma omp parallel for schedule(dynamic)
|
||||
for (int i = 0; i < TaskList.size(); i++) {
|
||||
checkCudaErrors(cudaSetDevice(dev_id));
|
||||
int tid = omp_get_thread_num();
|
||||
execute(TaskList[i], handles, streams, tid);
|
||||
}
|
||||
for (int i = 0; i < TaskList.size(); i++) {
|
||||
checkCudaErrors(cudaSetDevice(dev_id));
|
||||
int tid = omp_get_thread_num();
|
||||
execute(TaskList[i], handles, streams, tid);
|
||||
}
|
||||
#endif
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// Destroy CUDA Streams, cuBlas handles
|
||||
for (int i = 0; i < nthreads + 1; i++) {
|
||||
cudaStreamDestroy(streams[i]);
|
||||
cublasDestroy(handles[i]);
|
||||
}
|
||||
// Destroy CUDA Streams, cuBlas handles
|
||||
for (int i = 0; i < nthreads + 1; i++) {
|
||||
cudaStreamDestroy(streams[i]);
|
||||
cublasDestroy(handles[i]);
|
||||
}
|
||||
|
||||
// Free TaskList
|
||||
std::vector<Task<double> >().swap(TaskList);
|
||||
// Free TaskList
|
||||
std::vector<Task<double>>().swap(TaskList);
|
||||
|
||||
printf("All Done!\n");
|
||||
exit(EXIT_SUCCESS);
|
||||
printf("All Done!\n");
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
@@ -38,105 +38,107 @@
|
||||
#include <stdio.h>
|
||||
|
||||
// includes CUDA Runtime
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda_profiler_api.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
// includes, project
|
||||
#include <helper_cuda.h>
|
||||
#include <helper_functions.h> // helper utility functions
|
||||
#include <helper_functions.h> // helper utility functions
|
||||
|
||||
__global__ void increment_kernel(int *g_data, int inc_value) {
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
g_data[idx] = g_data[idx] + inc_value;
|
||||
__global__ void increment_kernel(int *g_data, int inc_value)
|
||||
{
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
g_data[idx] = g_data[idx] + inc_value;
|
||||
}
|
||||
|
||||
bool correct_output(int *data, const int n, const int x) {
|
||||
for (int i = 0; i < n; i++)
|
||||
if (data[i] != x) {
|
||||
printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
|
||||
return false;
|
||||
bool correct_output(int *data, const int n, const int x)
|
||||
{
|
||||
for (int i = 0; i < n; i++)
|
||||
if (data[i] != x) {
|
||||
printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int devID;
|
||||
cudaDeviceProp deviceProps;
|
||||
|
||||
printf("[%s] - Starting...\n", argv[0]);
|
||||
|
||||
// This will pick the best possible CUDA capable device
|
||||
devID = findCudaDevice(argc, (const char **)argv);
|
||||
|
||||
// get device name
|
||||
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
|
||||
printf("CUDA device [%s]\n", deviceProps.name);
|
||||
|
||||
int n = 16 * 1024 * 1024;
|
||||
int nbytes = n * sizeof(int);
|
||||
int value = 26;
|
||||
|
||||
// allocate host memory
|
||||
int *a = 0;
|
||||
checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
|
||||
memset(a, 0, nbytes);
|
||||
|
||||
// allocate device memory
|
||||
int *d_a = 0;
|
||||
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
|
||||
checkCudaErrors(cudaMemset(d_a, 255, nbytes));
|
||||
|
||||
// set kernel launch configuration
|
||||
dim3 threads = dim3(512, 1);
|
||||
dim3 blocks = dim3(n / threads.x, 1);
|
||||
|
||||
// create cuda event handles
|
||||
cudaEvent_t start, stop;
|
||||
checkCudaErrors(cudaEventCreate(&start));
|
||||
checkCudaErrors(cudaEventCreate(&stop));
|
||||
|
||||
StopWatchInterface *timer = NULL;
|
||||
sdkCreateTimer(&timer);
|
||||
sdkResetTimer(&timer);
|
||||
|
||||
checkCudaErrors(cudaDeviceSynchronize());
|
||||
float gpu_time = 0.0f;
|
||||
|
||||
// asynchronously issue work to the GPU (all to stream 0)
|
||||
checkCudaErrors(cudaProfilerStart());
|
||||
sdkStartTimer(&timer);
|
||||
cudaEventRecord(start, 0);
|
||||
cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
|
||||
increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
|
||||
cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
|
||||
cudaEventRecord(stop, 0);
|
||||
sdkStopTimer(&timer);
|
||||
checkCudaErrors(cudaProfilerStop());
|
||||
|
||||
// have CPU do some work while waiting for stage 1 to finish
|
||||
unsigned long int counter = 0;
|
||||
|
||||
while (cudaEventQuery(stop) == cudaErrorNotReady) {
|
||||
counter++;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int devID;
|
||||
cudaDeviceProp deviceProps;
|
||||
|
||||
printf("[%s] - Starting...\n", argv[0]);
|
||||
|
||||
// This will pick the best possible CUDA capable device
|
||||
devID = findCudaDevice(argc, (const char **)argv);
|
||||
|
||||
// get device name
|
||||
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
|
||||
printf("CUDA device [%s]\n", deviceProps.name);
|
||||
|
||||
int n = 16 * 1024 * 1024;
|
||||
int nbytes = n * sizeof(int);
|
||||
int value = 26;
|
||||
|
||||
// allocate host memory
|
||||
int *a = 0;
|
||||
checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
|
||||
memset(a, 0, nbytes);
|
||||
|
||||
// allocate device memory
|
||||
int *d_a = 0;
|
||||
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
|
||||
checkCudaErrors(cudaMemset(d_a, 255, nbytes));
|
||||
|
||||
// set kernel launch configuration
|
||||
dim3 threads = dim3(512, 1);
|
||||
dim3 blocks = dim3(n / threads.x, 1);
|
||||
|
||||
// create cuda event handles
|
||||
cudaEvent_t start, stop;
|
||||
checkCudaErrors(cudaEventCreate(&start));
|
||||
checkCudaErrors(cudaEventCreate(&stop));
|
||||
|
||||
StopWatchInterface *timer = NULL;
|
||||
sdkCreateTimer(&timer);
|
||||
sdkResetTimer(&timer);
|
||||
|
||||
checkCudaErrors(cudaDeviceSynchronize());
|
||||
float gpu_time = 0.0f;
|
||||
|
||||
// asynchronously issue work to the GPU (all to stream 0)
|
||||
checkCudaErrors(cudaProfilerStart());
|
||||
sdkStartTimer(&timer);
|
||||
cudaEventRecord(start, 0);
|
||||
cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
|
||||
increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
|
||||
cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
|
||||
cudaEventRecord(stop, 0);
|
||||
sdkStopTimer(&timer);
|
||||
checkCudaErrors(cudaProfilerStop());
|
||||
|
||||
// have CPU do some work while waiting for stage 1 to finish
|
||||
unsigned long int counter = 0;
|
||||
|
||||
while (cudaEventQuery(stop) == cudaErrorNotReady) {
|
||||
counter++;
|
||||
}
|
||||
|
||||
checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
|
||||
|
||||
// print the cpu and gpu times
|
||||
printf("time spent executing by the GPU: %.2f\n", gpu_time);
|
||||
printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
|
||||
printf("CPU executed %lu iterations while waiting for GPU to finish\n",
|
||||
counter);
|
||||
|
||||
// check the output for correctness
|
||||
bool bFinalResults = correct_output(a, n, value);
|
||||
|
||||
// release resources
|
||||
checkCudaErrors(cudaEventDestroy(start));
|
||||
checkCudaErrors(cudaEventDestroy(stop));
|
||||
checkCudaErrors(cudaFreeHost(a));
|
||||
checkCudaErrors(cudaFree(d_a));
|
||||
|
||||
exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
|
||||
|
||||
// print the cpu and gpu times
|
||||
printf("time spent executing by the GPU: %.2f\n", gpu_time);
|
||||
printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
|
||||
printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
|
||||
|
||||
// check the output for correctness
|
||||
bool bFinalResults = correct_output(a, n, value);
|
||||
|
||||
// release resources
|
||||
checkCudaErrors(cudaEventDestroy(start));
|
||||
checkCudaErrors(cudaEventDestroy(stop));
|
||||
checkCudaErrors(cudaFreeHost(a));
|
||||
checkCudaErrors(cudaFree(d_a));
|
||||
|
||||
exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
@@ -48,43 +48,46 @@
|
||||
// This kernel computes a standard parallel reduction and evaluates the
|
||||
// time it takes to do that for each block. The timing results are stored
|
||||
// in device memory.
|
||||
__global__ static void timedReduction(const float *input, float *output,
|
||||
clock_t *timer) {
|
||||
// __shared__ float shared[2 * blockDim.x];
|
||||
extern __shared__ float shared[];
|
||||
__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
|
||||
{
|
||||
// __shared__ float shared[2 * blockDim.x];
|
||||
extern __shared__ float shared[];
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = blockIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = blockIdx.x;
|
||||
|
||||
if (tid == 0) timer[bid] = clock();
|
||||
if (tid == 0)
|
||||
timer[bid] = clock();
|
||||
|
||||
// Copy input.
|
||||
shared[tid] = input[tid];
|
||||
shared[tid + blockDim.x] = input[tid + blockDim.x];
|
||||
// Copy input.
|
||||
shared[tid] = input[tid];
|
||||
shared[tid + blockDim.x] = input[tid + blockDim.x];
|
||||
|
||||
// Perform reduction to find minimum.
|
||||
for (int d = blockDim.x; d > 0; d /= 2) {
|
||||
__syncthreads();
|
||||
|
||||
if (tid < d) {
|
||||
float f0 = shared[tid];
|
||||
float f1 = shared[tid + d];
|
||||
|
||||
if (f1 < f0) {
|
||||
shared[tid] = f1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write result.
|
||||
if (tid == 0)
|
||||
output[bid] = shared[0];
|
||||
|
||||
// Perform reduction to find minimum.
|
||||
for (int d = blockDim.x; d > 0; d /= 2) {
|
||||
__syncthreads();
|
||||
|
||||
if (tid < d) {
|
||||
float f0 = shared[tid];
|
||||
float f1 = shared[tid + d];
|
||||
|
||||
if (f1 < f0) {
|
||||
shared[tid] = f1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write result.
|
||||
if (tid == 0) output[bid] = shared[0];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (tid == 0) timer[bid + gridDim.x] = clock();
|
||||
if (tid == 0)
|
||||
timer[bid + gridDim.x] = clock();
|
||||
}
|
||||
|
||||
#define NUM_BLOCKS 64
|
||||
#define NUM_BLOCKS 64
|
||||
#define NUM_THREADS 256
|
||||
|
||||
// It's interesting to change the number of blocks and the number of threads to
|
||||
@@ -104,50 +107,46 @@ __global__ static void timedReduction(const float *input, float *output,
|
||||
// the memory. With more than 32 the speed scales linearly.
|
||||
|
||||
// Start the main CUDA Sample here
|
||||
int main(int argc, char **argv) {
|
||||
printf("CUDA Clock sample\n");
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
printf("CUDA Clock sample\n");
|
||||
|
||||
// This will pick the best possible CUDA capable device
|
||||
int dev = findCudaDevice(argc, (const char **)argv);
|
||||
// This will pick the best possible CUDA capable device
|
||||
int dev = findCudaDevice(argc, (const char **)argv);
|
||||
|
||||
float *dinput = NULL;
|
||||
float *doutput = NULL;
|
||||
clock_t *dtimer = NULL;
|
||||
float *dinput = NULL;
|
||||
float *doutput = NULL;
|
||||
clock_t *dtimer = NULL;
|
||||
|
||||
clock_t timer[NUM_BLOCKS * 2];
|
||||
float input[NUM_THREADS * 2];
|
||||
clock_t timer[NUM_BLOCKS * 2];
|
||||
float input[NUM_THREADS * 2];
|
||||
|
||||
for (int i = 0; i < NUM_THREADS * 2; i++) {
|
||||
input[i] = (float)i;
|
||||
}
|
||||
for (int i = 0; i < NUM_THREADS * 2; i++) {
|
||||
input[i] = (float)i;
|
||||
}
|
||||
|
||||
checkCudaErrors(
|
||||
cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
|
||||
checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
|
||||
checkCudaErrors(
|
||||
cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
|
||||
checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
|
||||
checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
|
||||
checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
|
||||
|
||||
checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2,
|
||||
cudaMemcpyHostToDevice));
|
||||
checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
|
||||
|
||||
timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(
|
||||
dinput, doutput, dtimer);
|
||||
timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
|
||||
|
||||
checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2,
|
||||
cudaMemcpyDeviceToHost));
|
||||
checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
|
||||
|
||||
checkCudaErrors(cudaFree(dinput));
|
||||
checkCudaErrors(cudaFree(doutput));
|
||||
checkCudaErrors(cudaFree(dtimer));
|
||||
checkCudaErrors(cudaFree(dinput));
|
||||
checkCudaErrors(cudaFree(doutput));
|
||||
checkCudaErrors(cudaFree(dtimer));
|
||||
|
||||
long double avgElapsedClocks = 0;
|
||||
long double avgElapsedClocks = 0;
|
||||
|
||||
for (int i = 0; i < NUM_BLOCKS; i++) {
|
||||
avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
|
||||
}
|
||||
for (int i = 0; i < NUM_BLOCKS; i++) {
|
||||
avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
|
||||
}
|
||||
|
||||
avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
|
||||
printf("Average clocks/block = %Lf\n", avgElapsedClocks);
|
||||
avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
|
||||
printf("Average clocks/block = %Lf\n", avgElapsedClocks);
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
@@ -34,12 +34,11 @@
 */

 // System includes
-#include <stdio.h>
-#include <stdint.h>
 #include <assert.h>
-
 #include <cuda_runtime.h>
 #include <nvrtc_helper.h>
+#include <stdint.h>
+#include <stdio.h>

 // helper functions and utilities to work with CUDA
 #include <helper_functions.h>
@@ -71,64 +70,68 @@
|
||||
|
||||
// Start the main CUDA Sample here
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
printf("CUDA Clock sample\n");
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
printf("CUDA Clock sample\n");
|
||||
|
||||
typedef long clock_t;
|
||||
typedef long clock_t;
|
||||
|
||||
clock_t timer[NUM_BLOCKS * 2];
|
||||
clock_t timer[NUM_BLOCKS * 2];
|
||||
|
||||
float input[NUM_THREADS * 2];
|
||||
float input[NUM_THREADS * 2];
|
||||
|
||||
for (int i = 0; i < NUM_THREADS * 2; i++) {
|
||||
input[i] = (float)i;
|
||||
}
|
||||
for (int i = 0; i < NUM_THREADS * 2; i++) {
|
||||
input[i] = (float)i;
|
||||
}
|
||||
|
||||
char *cubin, *kernel_file;
|
||||
size_t cubinSize;
|
||||
char *cubin, *kernel_file;
|
||||
size_t cubinSize;
|
||||
|
||||
kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
|
||||
compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
|
||||
kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
|
||||
compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
|
||||
|
||||
CUmodule module = loadCUBIN(cubin, argc, argv);
|
||||
CUfunction kernel_addr;
|
||||
CUmodule module = loadCUBIN(cubin, argc, argv);
|
||||
CUfunction kernel_addr;
|
||||
|
||||
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));
|
||||
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));
|
||||
|
||||
dim3 cudaBlockSize(NUM_THREADS, 1, 1);
|
||||
dim3 cudaGridSize(NUM_BLOCKS, 1, 1);
|
||||
dim3 cudaBlockSize(NUM_THREADS, 1, 1);
|
||||
dim3 cudaGridSize(NUM_BLOCKS, 1, 1);
|
||||
|
||||
CUdeviceptr dinput, doutput, dtimer;
|
||||
checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
|
||||
checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
|
||||
checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
|
||||
checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));
|
||||
CUdeviceptr dinput, doutput, dtimer;
|
||||
checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
|
||||
checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
|
||||
checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
|
||||
checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));
|
||||
|
||||
void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};
|
||||
void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};
|
||||
|
||||
checkCudaErrors(cuLaunchKernel(
|
||||
kernel_addr, cudaGridSize.x, cudaGridSize.y,
|
||||
cudaGridSize.z, /* grid dim */
|
||||
cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
|
||||
sizeof(float) * 2 * NUM_THREADS, 0, /* shared mem, stream */
|
||||
&arr[0], /* arguments */
|
||||
0));
|
||||
checkCudaErrors(cuLaunchKernel(kernel_addr,
|
||||
cudaGridSize.x,
|
||||
cudaGridSize.y,
|
||||
cudaGridSize.z, /* grid dim */
|
||||
cudaBlockSize.x,
|
||||
cudaBlockSize.y,
|
||||
cudaBlockSize.z, /* block dim */
|
||||
sizeof(float) * 2 * NUM_THREADS,
|
||||
0, /* shared mem, stream */
|
||||
&arr[0], /* arguments */
|
||||
0));
|
||||
|
||||
checkCudaErrors(cuCtxSynchronize());
|
||||
checkCudaErrors(
|
||||
cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
|
||||
checkCudaErrors(cuMemFree(dinput));
|
||||
checkCudaErrors(cuMemFree(doutput));
|
||||
checkCudaErrors(cuMemFree(dtimer));
|
||||
checkCudaErrors(cuCtxSynchronize());
|
||||
checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
|
||||
checkCudaErrors(cuMemFree(dinput));
|
||||
checkCudaErrors(cuMemFree(doutput));
|
||||
checkCudaErrors(cuMemFree(dtimer));
|
||||
|
||||
long double avgElapsedClocks = 0;
|
||||
long double avgElapsedClocks = 0;
|
||||
|
||||
for (int i = 0; i < NUM_BLOCKS; i++) {
|
||||
avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
|
||||
}
|
||||
for (int i = 0; i < NUM_BLOCKS; i++) {
|
||||
avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
|
||||
}
|
||||
|
||||
avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
|
||||
printf("Average clocks/block = %Lf\n", avgElapsedClocks);
|
||||
avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
|
||||
printf("Average clocks/block = %Lf\n", avgElapsedClocks);
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
@@ -37,38 +37,41 @@
|
||||
// time it takes to do that for each block. The timing results are stored
|
||||
// in device memory.
|
||||
|
||||
extern "C" __global__ void timedReduction(const float *input, float *output,
|
||||
clock_t *timer) {
|
||||
// __shared__ float shared[2 * blockDim.x];
|
||||
extern __shared__ float shared[];
|
||||
extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer)
|
||||
{
|
||||
// __shared__ float shared[2 * blockDim.x];
|
||||
extern __shared__ float shared[];
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = blockIdx.x;
|
||||
const int tid = threadIdx.x;
|
||||
const int bid = blockIdx.x;
|
||||
|
||||
if (tid == 0) timer[bid] = clock();
|
||||
if (tid == 0)
|
||||
timer[bid] = clock();
|
||||
|
||||
// Copy input.
|
||||
shared[tid] = input[tid];
|
||||
shared[tid + blockDim.x] = input[tid + blockDim.x];
|
||||
// Copy input.
|
||||
shared[tid] = input[tid];
|
||||
shared[tid + blockDim.x] = input[tid + blockDim.x];
|
||||
|
||||
// Perform reduction to find minimum.
|
||||
for (int d = blockDim.x; d > 0; d /= 2) {
|
||||
__syncthreads();
|
||||
|
||||
if (tid < d) {
|
||||
float f0 = shared[tid];
|
||||
float f1 = shared[tid + d];
|
||||
|
||||
if (f1 < f0) {
|
||||
shared[tid] = f1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write result.
|
||||
if (tid == 0)
|
||||
output[bid] = shared[0];
|
||||
|
||||
// Perform reduction to find minimum.
|
||||
for (int d = blockDim.x; d > 0; d /= 2) {
|
||||
__syncthreads();
|
||||
|
||||
if (tid < d) {
|
||||
float f0 = shared[tid];
|
||||
float f1 = shared[tid + d];
|
||||
|
||||
if (f1 < f0) {
|
||||
shared[tid] = f1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Write result.
|
||||
if (tid == 0) output[bid] = shared[0];
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (tid == 0) timer[bid + gridDim.x] = clock();
|
||||
if (tid == 0)
|
||||
timer[bid + gridDim.x] = clock();
|
||||
}
|
||||
|
@@ -32,128 +32,125 @@
|
||||
|
||||
#include <helper_cuda.h>
|
||||
#include <omp.h>
|
||||
#include <stdio.h> // stdio functions are used since C++ streams aren't necessarily thread safe
|
||||
#include <stdio.h> // stdio functions are used since C++ streams aren't necessarily thread safe
|
||||
|
||||
using namespace std;
|
||||
|
||||
// a simple kernel that simply increments each array element by b
|
||||
__global__ void kernelAddConstant(int *g_a, const int b) {
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
g_a[idx] += b;
|
||||
__global__ void kernelAddConstant(int *g_a, const int b)
|
||||
{
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
g_a[idx] += b;
|
||||
}
|
||||
|
||||
// a predicate that checks whether each array element is set to its index plus b
|
||||
int correctResult(int *data, const int n, const int b) {
|
||||
for (int i = 0; i < n; i++)
|
||||
if (data[i] != i + b) return 0;
|
||||
int correctResult(int *data, const int n, const int b)
|
||||
{
|
||||
for (int i = 0; i < n; i++)
|
||||
if (data[i] != i + b)
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int num_gpus = 0; // number of CUDA GPUs
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
int num_gpus = 0; // number of CUDA GPUs
|
||||
|
||||
printf("%s Starting...\n\n", argv[0]);
|
||||
printf("%s Starting...\n\n", argv[0]);
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// determine the number of CUDA capable GPUs
|
||||
//
|
||||
cudaGetDeviceCount(&num_gpus);
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// determine the number of CUDA capable GPUs
|
||||
//
|
||||
cudaGetDeviceCount(&num_gpus);
|
||||
|
||||
if (num_gpus < 1) {
|
||||
printf("no CUDA capable devices were detected\n");
|
||||
return 1;
|
||||
}
|
||||
if (num_gpus < 1) {
|
||||
printf("no CUDA capable devices were detected\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// display CPU and GPU configuration
|
||||
//
|
||||
printf("number of host CPUs:\t%d\n", omp_get_num_procs());
|
||||
printf("number of CUDA devices:\t%d\n", num_gpus);
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// display CPU and GPU configuration
|
||||
//
|
||||
printf("number of host CPUs:\t%d\n", omp_get_num_procs());
|
||||
printf("number of CUDA devices:\t%d\n", num_gpus);
|
||||
|
||||
for (int i = 0; i < num_gpus; i++) {
|
||||
cudaDeviceProp dprop;
|
||||
cudaGetDeviceProperties(&dprop, i);
|
||||
printf(" %d: %s\n", i, dprop.name);
|
||||
}
|
||||
for (int i = 0; i < num_gpus; i++) {
|
||||
cudaDeviceProp dprop;
|
||||
cudaGetDeviceProperties(&dprop, i);
|
||||
printf(" %d: %s\n", i, dprop.name);
|
||||
}
|
||||
|
||||
printf("---------------------------\n");
|
||||
printf("---------------------------\n");
|
||||
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// initialize data
|
||||
//
|
||||
unsigned int n = num_gpus * 8192;
|
||||
unsigned int nbytes = n * sizeof(int);
|
||||
int *a = 0; // pointer to data on the CPU
|
||||
int b = 3; // value by which the array is incremented
|
||||
a = (int *)malloc(nbytes);
|
||||
/////////////////////////////////////////////////////////////////
|
||||
// initialize data
|
||||
//
|
||||
unsigned int n = num_gpus * 8192;
|
||||
unsigned int nbytes = n * sizeof(int);
|
||||
int *a = 0; // pointer to data on the CPU
|
||||
int b = 3; // value by which the array is incremented
|
||||
a = (int *)malloc(nbytes);
|
||||
|
||||
if (0 == a) {
|
||||
printf("couldn't allocate CPU memory\n");
|
||||
return 1;
|
||||
}
|
||||
if (0 == a) {
|
||||
printf("couldn't allocate CPU memory\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
for (unsigned int i = 0; i < n; i++) a[i] = i;
|
||||
for (unsigned int i = 0; i < n; i++)
|
||||
a[i] = i;
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// run as many CPU threads as there are CUDA devices
|
||||
// each CPU thread controls a different device, processing its
|
||||
// portion of the data. It's possible to use more CPU threads
|
||||
// than there are CUDA devices, in which case several CPU
|
||||
// threads will be allocating resources and launching kernels
|
||||
// on the same device. For example, try omp_set_num_threads(2*num_gpus);
|
||||
// Recall that all variables declared inside an "omp parallel" scope are
|
||||
// local to each CPU thread
|
||||
//
|
||||
omp_set_num_threads(
|
||||
num_gpus); // create as many CPU threads as there are CUDA devices
|
||||
////////////////////////////////////////////////////////////////
|
||||
// run as many CPU threads as there are CUDA devices
|
||||
// each CPU thread controls a different device, processing its
|
||||
// portion of the data. It's possible to use more CPU threads
|
||||
// than there are CUDA devices, in which case several CPU
|
||||
// threads will be allocating resources and launching kernels
|
||||
// on the same device. For example, try omp_set_num_threads(2*num_gpus);
|
||||
// Recall that all variables declared inside an "omp parallel" scope are
|
||||
// local to each CPU thread
|
||||
//
|
||||
omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
|
||||
// omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there
|
||||
// are CUDA devices
|
||||
#pragma omp parallel
|
||||
{
|
||||
unsigned int cpu_thread_id = omp_get_thread_num();
|
||||
unsigned int num_cpu_threads = omp_get_num_threads();
|
||||
{
|
||||
unsigned int cpu_thread_id = omp_get_thread_num();
|
||||
unsigned int num_cpu_threads = omp_get_num_threads();
|
||||
|
||||
// set and check the CUDA device for this CPU thread
|
||||
int gpu_id = -1;
|
||||
checkCudaErrors(cudaSetDevice(
|
||||
cpu_thread_id %
|
||||
num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
|
||||
checkCudaErrors(cudaGetDevice(&gpu_id));
|
||||
printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id,
|
||||
num_cpu_threads, gpu_id);
|
||||
// set and check the CUDA device for this CPU thread
|
||||
int gpu_id = -1;
|
||||
checkCudaErrors(
|
||||
cudaSetDevice(cpu_thread_id % num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
|
||||
checkCudaErrors(cudaGetDevice(&gpu_id));
|
||||
printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);
|
||||
|
||||
int *d_a =
|
||||
0; // pointer to memory on the device associated with this CPU thread
|
||||
int *sub_a =
|
||||
a +
|
||||
cpu_thread_id * n /
|
||||
num_cpu_threads; // pointer to this CPU thread's portion of data
|
||||
unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
|
||||
dim3 gpu_threads(128); // 128 threads per block
|
||||
dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));
|
||||
int *d_a = 0; // pointer to memory on the device associated with this CPU thread
|
||||
int *sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data
|
||||
unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
|
||||
dim3 gpu_threads(128); // 128 threads per block
|
||||
dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));
|
||||
|
||||
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
|
||||
checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
|
||||
checkCudaErrors(
|
||||
cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
|
||||
kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
|
||||
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
|
||||
checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
|
||||
checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
|
||||
kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
|
||||
|
||||
checkCudaErrors(
|
||||
cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
|
||||
checkCudaErrors(cudaFree(d_a));
|
||||
}
|
||||
printf("---------------------------\n");
|
||||
checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
|
||||
checkCudaErrors(cudaFree(d_a));
|
||||
}
|
||||
printf("---------------------------\n");
|
||||
|
||||
if (cudaSuccess != cudaGetLastError())
|
||||
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
|
||||
if (cudaSuccess != cudaGetLastError())
|
||||
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
|
||||
|
||||
////////////////////////////////////////////////////////////////
|
||||
// check the result
|
||||
//
|
||||
bool bResult = correctResult(a, n, b);
|
||||
////////////////////////////////////////////////////////////////
|
||||
// check the result
|
||||
//
|
||||
bool bResult = correctResult(a, n, b);
|
||||
|
||||
if (a) free(a); // free CPU memory
|
||||
if (a)
|
||||
free(a); // free CPU memory
|
||||
|
||||
exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
@@ -25,191 +25,188 @@
|
||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include "cuda_fp16.h"
|
||||
#include "helper_cuda.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <ctime>
|
||||
|
||||
#define NUM_OF_BLOCKS 128
|
||||
#include "cuda_fp16.h"
|
||||
#include "helper_cuda.h"
|
||||
|
||||
#define NUM_OF_BLOCKS 128
|
||||
#define NUM_OF_THREADS 128
|
||||
|
||||
__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) {
|
||||
if (threadIdx.x < 64)
|
||||
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 32)
|
||||
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 32]);
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 16)
|
||||
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 16]);
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 8)
|
||||
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 8]);
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 4)
|
||||
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 4]);
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 2)
|
||||
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 2]);
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 1)
|
||||
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 1]);
|
||||
__syncthreads();
|
||||
__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v)
|
||||
{
|
||||
if (threadIdx.x < 64)
|
||||
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 32)
|
||||
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 32]);
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 16)
|
||||
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 16]);
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 8)
|
||||
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 8]);
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 4)
|
||||
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 4]);
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 2)
|
||||
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 2]);
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 1)
|
||||
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 1]);
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__forceinline__ __device__ void reduceInShared_native(half2 *const v) {
|
||||
if (threadIdx.x < 64) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 32) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 16) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 8) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 4) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 2) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 1) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
|
||||
__syncthreads();
|
||||
__forceinline__ __device__ void reduceInShared_native(half2 *const v)
|
||||
{
|
||||
if (threadIdx.x < 64)
|
||||
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 32)
|
||||
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 16)
|
||||
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 8)
|
||||
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 4)
|
||||
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 2)
|
||||
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
|
||||
__syncthreads();
|
||||
if (threadIdx.x < 1)
|
||||
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
__global__ void scalarProductKernel_intrinsics(half2 const *const a,
                                               half2 const *const b,
                                               float *const results,
                                               size_t const size) {
    const int stride = gridDim.x * blockDim.x;
    __shared__ half2 shArray[NUM_OF_THREADS];
__global__ void
scalarProductKernel_intrinsics(half2 const *const a, half2 const *const b, float *const results, size_t const size)
{
    const int stride = gridDim.x * blockDim.x;
    __shared__ half2 shArray[NUM_OF_THREADS];

    shArray[threadIdx.x] = __float2half2_rn(0.f);
    half2 value = __float2half2_rn(0.f);
    shArray[threadIdx.x] = __float2half2_rn(0.f);
    half2 value = __float2half2_rn(0.f);

    for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
        value = __hfma2(a[i], b[i], value);
    }
    for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
        value = __hfma2(a[i], b[i], value);
    }

    shArray[threadIdx.x] = value;
    __syncthreads();
    reduceInShared_intrinsics(shArray);
    shArray[threadIdx.x] = value;
    __syncthreads();
    reduceInShared_intrinsics(shArray);

    if (threadIdx.x == 0) {
        half2 result = shArray[0];
        float f_result = __low2float(result) + __high2float(result);
        results[blockIdx.x] = f_result;
    }
    if (threadIdx.x == 0) {
        half2 result = shArray[0];
        float f_result = __low2float(result) + __high2float(result);
        results[blockIdx.x] = f_result;
    }
}
__global__ void scalarProductKernel_native(half2 const *const a,
                                           half2 const *const b,
                                           float *const results,
                                           size_t const size) {
    const int stride = gridDim.x * blockDim.x;
    __shared__ half2 shArray[NUM_OF_THREADS];
__global__ void
scalarProductKernel_native(half2 const *const a, half2 const *const b, float *const results, size_t const size)
{
    const int stride = gridDim.x * blockDim.x;
    __shared__ half2 shArray[NUM_OF_THREADS];

    half2 value(0.f, 0.f);
    shArray[threadIdx.x] = value;
    half2 value(0.f, 0.f);
    shArray[threadIdx.x] = value;

    for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
        value = a[i] * b[i] + value;
    }
    for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
        value = a[i] * b[i] + value;
    }

    shArray[threadIdx.x] = value;
    __syncthreads();
    reduceInShared_native(shArray);
    shArray[threadIdx.x] = value;
    __syncthreads();
    reduceInShared_native(shArray);

    if (threadIdx.x == 0) {
        half2 result = shArray[0];
        float f_result = (float)result.y + (float)result.x;
        results[blockIdx.x] = f_result;
    }
    if (threadIdx.x == 0) {
        half2 result = shArray[0];
        float f_result = (float)result.y + (float)result.x;
        results[blockIdx.x] = f_result;
    }
}
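// Note: the grid-stride loops in both kernels above start at threadIdx.x + blockDim.x + blockIdx.x
// rather than the conventional blockIdx.x * blockDim.x + threadIdx.x; since the intrinsics and
// native kernels use identical indexing, the PASSED/FAILED comparison in main() is unaffected.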
void generateInput(half2 *a, size_t size) {
    for (size_t i = 0; i < size; ++i) {
        half2 temp;
        temp.x = static_cast<float>(rand() % 4);
        temp.y = static_cast<float>(rand() % 2);
        a[i] = temp;
    }
void generateInput(half2 *a, size_t size)
{
    for (size_t i = 0; i < size; ++i) {
        half2 temp;
        temp.x = static_cast<float>(rand() % 4);
        temp.y = static_cast<float>(rand() % 2);
        a[i] = temp;
    }
}
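// Note: generateInput fills both operands with small non-negative integers (x in {0,1,2,3},
// y in {0,1}), so the two kernels operate on identical, small-magnitude data.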
int main(int argc, char *argv[]) {
    srand((unsigned int)time(NULL));
    size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;
int main(int argc, char *argv[])
{
    srand((unsigned int)time(NULL));
    size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;

    half2 *vec[2];
    half2 *devVec[2];
    half2 *vec[2];
    half2 *devVec[2];

    float *results;
    float *devResults;
    float *results;
    float *devResults;

    int devID = findCudaDevice(argc, (const char **)argv);
    int devID = findCudaDevice(argc, (const char **)argv);

    cudaDeviceProp devProp;
    checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));
    cudaDeviceProp devProp;
    checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));

    if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
        printf(
            "ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
            "higher.\n");
        return EXIT_WAIVED;
    }
    if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
        printf("ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
               "higher.\n");
        return EXIT_WAIVED;
    }

    for (int i = 0; i < 2; ++i) {
        checkCudaErrors(cudaMallocHost((void **)&vec[i], size * sizeof *vec[i]));
        checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
    }
    for (int i = 0; i < 2; ++i) {
        checkCudaErrors(cudaMallocHost((void **)&vec[i], size * sizeof *vec[i]));
        checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
    }

    checkCudaErrors(
        cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
    checkCudaErrors(
        cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));
    checkCudaErrors(cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
    checkCudaErrors(cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));

    for (int i = 0; i < 2; ++i) {
        generateInput(vec[i], size);
        checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i],
                                   cudaMemcpyHostToDevice));
    }
    for (int i = 0; i < 2; ++i) {
        generateInput(vec[i], size);
        checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i], cudaMemcpyHostToDevice));
    }

    scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
        devVec[0], devVec[1], devResults, size);
    scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);

    checkCudaErrors(cudaMemcpy(results, devResults,
                               NUM_OF_BLOCKS * sizeof *results,
                               cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));

    float result_native = 0;
    for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
        result_native += results[i];
    }
    printf("Result native operators\t: %f \n", result_native);
    float result_native = 0;
    for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
        result_native += results[i];
    }
    printf("Result native operators\t: %f \n", result_native);

    scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
        devVec[0], devVec[1], devResults, size);
    scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);

    checkCudaErrors(cudaMemcpy(results, devResults,
                               NUM_OF_BLOCKS * sizeof *results,
                               cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));

    float result_intrinsics = 0;
    for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
        result_intrinsics += results[i];
    }
    printf("Result intrinsics\t: %f \n", result_intrinsics);
    float result_intrinsics = 0;
    for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
        result_intrinsics += results[i];
    }
    printf("Result intrinsics\t: %f \n", result_intrinsics);

    printf("&&&& fp16ScalarProduct %s\n",
           (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED"
                                                                : "FAILED");
    printf("&&&& fp16ScalarProduct %s\n", (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED" : "FAILED");

    for (int i = 0; i < 2; ++i) {
        checkCudaErrors(cudaFree(devVec[i]));
        checkCudaErrors(cudaFreeHost(vec[i]));
    }
    for (int i = 0; i < 2; ++i) {
        checkCudaErrors(cudaFree(devVec[i]));
        checkCudaErrors(cudaFreeHost(vec[i]));
    }

    checkCudaErrors(cudaFree(devResults));
    checkCudaErrors(cudaFreeHost(results));
    checkCudaErrors(cudaFree(devResults));
    checkCudaErrors(cudaFreeHost(results));

    return EXIT_SUCCESS;
    return EXIT_SUCCESS;
}
@ -40,314 +40,303 @@
|
||||
 */

// System includes
#include <stdio.h>
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_functions.h>
/**
|
||||
* Matrix multiplication (CUDA Kernel) on the device: C = A * B
|
||||
* wA is A's width and wB is B's width
|
||||
*/
|
||||
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,
|
||||
float *B, int wA,
|
||||
int wB) {
|
||||
// Block index
|
||||
int bx = blockIdx.x;
|
||||
int by = blockIdx.y;
|
||||
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
|
||||
{
|
||||
// Block index
|
||||
int bx = blockIdx.x;
|
||||
int by = blockIdx.y;
|
||||
|
||||
// Thread index
|
||||
int tx = threadIdx.x;
|
||||
int ty = threadIdx.y;
|
||||
// Thread index
|
||||
int tx = threadIdx.x;
|
||||
int ty = threadIdx.y;
|
||||
|
||||
// Index of the first sub-matrix of A processed by the block
|
||||
int aBegin = wA * BLOCK_SIZE * by;
|
||||
// Index of the first sub-matrix of A processed by the block
|
||||
int aBegin = wA * BLOCK_SIZE * by;
|
||||
|
||||
// Index of the last sub-matrix of A processed by the block
|
||||
int aEnd = aBegin + wA - 1;
|
||||
// Index of the last sub-matrix of A processed by the block
|
||||
int aEnd = aBegin + wA - 1;
|
||||
|
||||
// Step size used to iterate through the sub-matrices of A
|
||||
int aStep = BLOCK_SIZE;
|
||||
// Step size used to iterate through the sub-matrices of A
|
||||
int aStep = BLOCK_SIZE;
|
||||
|
||||
// Index of the first sub-matrix of B processed by the block
|
||||
int bBegin = BLOCK_SIZE * bx;
|
||||
// Index of the first sub-matrix of B processed by the block
|
||||
int bBegin = BLOCK_SIZE * bx;
|
||||
|
||||
// Step size used to iterate through the sub-matrices of B
|
||||
int bStep = BLOCK_SIZE * wB;
|
||||
// Step size used to iterate through the sub-matrices of B
|
||||
int bStep = BLOCK_SIZE * wB;
|
||||
|
||||
// Csub is used to store the element of the block sub-matrix
|
||||
// that is computed by the thread
|
||||
float Csub = 0;
|
||||
// Csub is used to store the element of the block sub-matrix
|
||||
// that is computed by the thread
|
||||
float Csub = 0;
|
||||
|
||||
// Loop over all the sub-matrices of A and B
|
||||
// required to compute the block sub-matrix
|
||||
for (int a = aBegin, b = bBegin;
|
||||
a <= aEnd;
|
||||
a += aStep, b += bStep) {
|
||||
// Declaration of the shared memory array As used to
|
||||
// store the sub-matrix of A
|
||||
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
|
||||
// Loop over all the sub-matrices of A and B
|
||||
// required to compute the block sub-matrix
|
||||
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
|
||||
// Declaration of the shared memory array As used to
|
||||
// store the sub-matrix of A
|
||||
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
|
||||
|
||||
// Declaration of the shared memory array Bs used to
|
||||
// store the sub-matrix of B
|
||||
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
|
||||
// Declaration of the shared memory array Bs used to
|
||||
// store the sub-matrix of B
|
||||
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
|
||||
|
||||
// Load the matrices from device memory
|
||||
// to shared memory; each thread loads
|
||||
// one element of each matrix
|
||||
As[ty][tx] = A[a + wA * ty + tx];
|
||||
Bs[ty][tx] = B[b + wB * ty + tx];
|
||||
// Load the matrices from device memory
|
||||
// to shared memory; each thread loads
|
||||
// one element of each matrix
|
||||
As[ty][tx] = A[a + wA * ty + tx];
|
||||
Bs[ty][tx] = B[b + wB * ty + tx];
|
||||
|
||||
// Synchronize to make sure the matrices are loaded
|
||||
__syncthreads();
|
||||
// Synchronize to make sure the matrices are loaded
|
||||
__syncthreads();
|
||||
|
||||
// Multiply the two matrices together;
|
||||
// each thread computes one element
|
||||
// of the block sub-matrix
|
||||
// Multiply the two matrices together;
|
||||
// each thread computes one element
|
||||
// of the block sub-matrix
|
||||
#pragma unroll
|
||||
|
||||
for (int k = 0; k < BLOCK_SIZE; ++k) {
|
||||
Csub += As[ty][k] * Bs[k][tx];
|
||||
for (int k = 0; k < BLOCK_SIZE; ++k) {
|
||||
Csub += As[ty][k] * Bs[k][tx];
|
||||
}
|
||||
|
||||
// Synchronize to make sure that the preceding
|
||||
// computation is done before loading two new
|
||||
// sub-matrices of A and B in the next iteration
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// Synchronize to make sure that the preceding
|
||||
// computation is done before loading two new
|
||||
// sub-matrices of A and B in the next iteration
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// Write the block sub-matrix to device memory;
|
||||
// each thread writes one element
|
||||
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
|
||||
C[c + wB * ty + tx] = Csub;
|
||||
// Write the block sub-matrix to device memory;
|
||||
// each thread writes one element
|
||||
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
|
||||
C[c + wB * ty + tx] = Csub;
|
||||
}
|
||||
|
||||
void ConstantInit(float *data, int size, float val) {
|
||||
for (int i = 0; i < size; ++i) {
|
||||
data[i] = val;
|
||||
}
|
||||
void ConstantInit(float *data, int size, float val)
|
||||
{
|
||||
for (int i = 0; i < size; ++i) {
|
||||
data[i] = val;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Run a simple test of matrix multiplication using CUDA
|
||||
*/
|
||||
int MatrixMultiply(int argc, char **argv,
|
||||
int block_size, const dim3 &dimsA,
|
||||
const dim3 &dimsB) {
|
||||
// Allocate host memory for matrices A and B
|
||||
unsigned int size_A = dimsA.x * dimsA.y;
|
||||
unsigned int mem_size_A = sizeof(float) * size_A;
|
||||
float *h_A;
|
||||
checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
|
||||
unsigned int size_B = dimsB.x * dimsB.y;
|
||||
unsigned int mem_size_B = sizeof(float) * size_B;
|
||||
float *h_B;
|
||||
checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
|
||||
cudaStream_t stream;
|
||||
int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB)
|
||||
{
|
||||
// Allocate host memory for matrices A and B
|
||||
unsigned int size_A = dimsA.x * dimsA.y;
|
||||
unsigned int mem_size_A = sizeof(float) * size_A;
|
||||
float *h_A;
|
||||
checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
|
||||
unsigned int size_B = dimsB.x * dimsB.y;
|
||||
unsigned int mem_size_B = sizeof(float) * size_B;
|
||||
float *h_B;
|
||||
checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
|
||||
cudaStream_t stream;
|
||||
|
||||
// Initialize host memory
|
||||
const float valB = 0.01f;
|
||||
ConstantInit(h_A, size_A, 1.0f);
|
||||
ConstantInit(h_B, size_B, valB);
|
||||
// Initialize host memory
|
||||
const float valB = 0.01f;
|
||||
ConstantInit(h_A, size_A, 1.0f);
|
||||
ConstantInit(h_B, size_B, valB);
|
||||
|
||||
// Allocate device memory
|
||||
float *d_A, *d_B, *d_C;
|
||||
// Allocate device memory
|
||||
float *d_A, *d_B, *d_C;
|
||||
|
||||
// Allocate host matrix C
|
||||
dim3 dimsC(dimsB.x, dimsA.y, 1);
|
||||
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
|
||||
float *h_C;
|
||||
checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));
|
||||
// Allocate host matrix C
|
||||
dim3 dimsC(dimsB.x, dimsA.y, 1);
|
||||
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
|
||||
float *h_C;
|
||||
checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));
|
||||
|
||||
if (h_C == NULL) {
|
||||
fprintf(stderr, "Failed to allocate host matrix C!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (h_C == NULL) {
|
||||
fprintf(stderr, "Failed to allocate host matrix C!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
|
||||
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
|
||||
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
|
||||
// Allocate CUDA events that we'll use for timing
|
||||
cudaEvent_t start, stop;
|
||||
checkCudaErrors(cudaEventCreate(&start));
|
||||
checkCudaErrors(cudaEventCreate(&stop));
|
||||
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
|
||||
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
|
||||
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
|
||||
// Allocate CUDA events that we'll use for timing
|
||||
cudaEvent_t start, stop;
|
||||
checkCudaErrors(cudaEventCreate(&start));
|
||||
checkCudaErrors(cudaEventCreate(&stop));
|
||||
|
||||
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
|
||||
|
||||
// copy host memory to device
|
||||
checkCudaErrors(
|
||||
cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
|
||||
checkCudaErrors(
|
||||
cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
|
||||
// copy host memory to device
|
||||
checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
|
||||
checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
|
||||
|
||||
// Setup execution parameters
|
||||
dim3 threads(block_size, block_size);
|
||||
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
|
||||
// Setup execution parameters
|
||||
dim3 threads(block_size, block_size);
|
||||
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
|
||||
|
||||
// Create and start timer
|
||||
printf("Computing result using CUDA Kernel...\n");
|
||||
// Create and start timer
|
||||
printf("Computing result using CUDA Kernel...\n");
|
||||
|
||||
// Performs warmup operation using matrixMul CUDA kernel
|
||||
if (block_size == 16) {
|
||||
MatrixMulCUDA<16>
|
||||
<<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
|
||||
} else {
|
||||
MatrixMulCUDA<32>
|
||||
<<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
|
||||
}
|
||||
|
||||
printf("done\n");
|
||||
checkCudaErrors(cudaStreamSynchronize(stream));
|
||||
|
||||
// Record the start event
|
||||
checkCudaErrors(cudaEventRecord(start, stream));
|
||||
|
||||
// Execute the kernel
|
||||
int nIter = 300;
|
||||
|
||||
for (int j = 0; j < nIter; j++) {
|
||||
// Performs warmup operation using matrixMul CUDA kernel
|
||||
if (block_size == 16) {
|
||||
MatrixMulCUDA<16>
|
||||
<<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
|
||||
} else {
|
||||
MatrixMulCUDA<32>
|
||||
<<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
|
||||
MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
|
||||
}
|
||||
}
|
||||
|
||||
// Record the stop event
|
||||
checkCudaErrors(cudaEventRecord(stop, stream));
|
||||
|
||||
// Wait for the stop event to complete
|
||||
checkCudaErrors(cudaEventSynchronize(stop));
|
||||
|
||||
float msecTotal = 0.0f;
|
||||
checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
|
||||
|
||||
// Compute and print the performance
|
||||
float msecPerMatrixMul = msecTotal / nIter;
|
||||
double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
|
||||
static_cast<double>(dimsA.y) *
|
||||
static_cast<double>(dimsB.x);
|
||||
double gigaFlops =
|
||||
(flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
|
||||
printf(
|
||||
"Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
|
||||
" WorkgroupSize= %u threads/block\n",
|
||||
gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);
|
||||
|
||||
// Copy result from device to host
|
||||
checkCudaErrors(
|
||||
cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
|
||||
checkCudaErrors(cudaStreamSynchronize(stream));
|
||||
|
||||
printf("Checking computed result for correctness: ");
|
||||
bool correct = true;
|
||||
|
||||
// test relative error by the formula
|
||||
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
|
||||
double eps = 1.e-6; // machine zero
|
||||
|
||||
for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
|
||||
double abs_err = fabs(h_C[i] - (dimsA.x * valB));
|
||||
double dot_length = dimsA.x;
|
||||
double abs_val = fabs(h_C[i]);
|
||||
double rel_err = abs_err / abs_val / dot_length;
|
||||
|
||||
if (rel_err > eps) {
|
||||
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n",
|
||||
i, h_C[i], dimsA.x * valB, eps);
|
||||
correct = false;
|
||||
else {
|
||||
MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
|
||||
}
|
||||
}
|
||||
|
||||
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
|
||||
printf("done\n");
|
||||
checkCudaErrors(cudaStreamSynchronize(stream));
|
||||
|
||||
// Clean up memory
|
||||
checkCudaErrors(cudaFreeHost(h_A));
|
||||
checkCudaErrors(cudaFreeHost(h_B));
|
||||
checkCudaErrors(cudaFreeHost(h_C));
|
||||
checkCudaErrors(cudaFree(d_A));
|
||||
checkCudaErrors(cudaFree(d_B));
|
||||
checkCudaErrors(cudaFree(d_C));
|
||||
checkCudaErrors(cudaEventDestroy(start));
|
||||
checkCudaErrors(cudaEventDestroy(stop));
|
||||
printf(
|
||||
"\nNOTE: The CUDA Samples are not meant for performance "
|
||||
"measurements. Results may vary when GPU Boost is enabled.\n");
|
||||
// Record the start event
|
||||
checkCudaErrors(cudaEventRecord(start, stream));
|
||||
|
||||
if (correct) {
|
||||
return EXIT_SUCCESS;
|
||||
} else {
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
// Execute the kernel
|
||||
int nIter = 300;
|
||||
|
||||
for (int j = 0; j < nIter; j++) {
|
||||
if (block_size == 16) {
|
||||
MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
|
||||
}
|
||||
else {
|
||||
MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
|
||||
}
|
||||
}
|
||||
|
||||
// Record the stop event
|
||||
checkCudaErrors(cudaEventRecord(stop, stream));
|
||||
|
||||
// Wait for the stop event to complete
|
||||
checkCudaErrors(cudaEventSynchronize(stop));
|
||||
|
||||
float msecTotal = 0.0f;
|
||||
checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
|
||||
|
||||
// Compute and print the performance
|
||||
float msecPerMatrixMul = msecTotal / nIter;
|
||||
double flopsPerMatrixMul =
|
||||
2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
|
||||
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
|
||||
printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
|
||||
" WorkgroupSize= %u threads/block\n",
|
||||
gigaFlops,
|
||||
msecPerMatrixMul,
|
||||
flopsPerMatrixMul,
|
||||
threads.x * threads.y);
|
||||
|
||||
// Copy result from device to host
|
||||
checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
|
||||
checkCudaErrors(cudaStreamSynchronize(stream));
|
||||
|
||||
printf("Checking computed result for correctness: ");
|
||||
bool correct = true;
|
||||
|
||||
// test relative error by the formula
|
||||
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
|
||||
double eps = 1.e-6; // machine zero
|
||||
|
||||
for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
|
||||
double abs_err = fabs(h_C[i] - (dimsA.x * valB));
|
||||
double dot_length = dimsA.x;
|
||||
double abs_val = fabs(h_C[i]);
|
||||
double rel_err = abs_err / abs_val / dot_length;
|
||||
|
||||
if (rel_err > eps) {
|
||||
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
|
||||
correct = false;
|
||||
}
|
||||
}
|
||||
|
||||
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
|
||||
|
||||
// Clean up memory
|
||||
checkCudaErrors(cudaFreeHost(h_A));
|
||||
checkCudaErrors(cudaFreeHost(h_B));
|
||||
checkCudaErrors(cudaFreeHost(h_C));
|
||||
checkCudaErrors(cudaFree(d_A));
|
||||
checkCudaErrors(cudaFree(d_B));
|
||||
checkCudaErrors(cudaFree(d_C));
|
||||
checkCudaErrors(cudaEventDestroy(start));
|
||||
checkCudaErrors(cudaEventDestroy(stop));
|
||||
printf("\nNOTE: The CUDA Samples are not meant for performance "
|
||||
"measurements. Results may vary when GPU Boost is enabled.\n");
|
||||
|
||||
if (correct) {
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
else {
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Program main
|
||||
*/
|
||||
int main(int argc, char **argv) {
|
||||
printf("[Matrix Multiply Using CUDA] - Starting...\n");
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
printf("[Matrix Multiply Using CUDA] - Starting...\n");
|
||||
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
|
||||
checkCmdLineFlag(argc, (const char **)argv, "?")) {
|
||||
printf("Usage -device=n (n >= 0 for deviceID)\n");
|
||||
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
|
||||
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
|
||||
printf(" Note: Outer matrix dimensions of A & B matrices" \
|
||||
" must be equal.\n");
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
|
||||
printf("Usage -device=n (n >= 0 for deviceID)\n");
|
||||
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
|
||||
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
|
||||
printf(" Note: Outer matrix dimensions of A & B matrices"
|
||||
" must be equal.\n");
|
||||
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
// This will pick the best possible CUDA capable device, otherwise
|
||||
// override the device ID based on input provided at the command line
|
||||
int dev = findCudaDevice(argc, (const char **)argv);
|
||||
// This will pick the best possible CUDA capable device, otherwise
|
||||
// override the device ID based on input provided at the command line
|
||||
int dev = findCudaDevice(argc, (const char **)argv);
|
||||
|
||||
int block_size = 32;
|
||||
int block_size = 32;
|
||||
|
||||
dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
|
||||
dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
|
||||
dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
|
||||
dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
|
||||
|
||||
// width of Matrix A
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
|
||||
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
|
||||
}
|
||||
// width of Matrix A
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
|
||||
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
|
||||
}
|
||||
|
||||
// height of Matrix A
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
|
||||
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
|
||||
}
|
||||
// height of Matrix A
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
|
||||
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
|
||||
}
|
||||
|
||||
// width of Matrix B
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
|
||||
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
|
||||
}
|
||||
// width of Matrix B
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
|
||||
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
|
||||
}
|
||||
|
||||
// height of Matrix B
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
|
||||
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
|
||||
}
|
||||
// height of Matrix B
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
|
||||
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
|
||||
}
|
||||
|
||||
if (dimsA.x != dimsB.y) {
|
||||
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
|
||||
dimsA.x, dimsB.y);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (dimsA.x != dimsB.y) {
|
||||
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y,
|
||||
dimsB.x, dimsB.y);
|
||||
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
|
||||
|
||||
checkCudaErrors(cudaProfilerStart());
|
||||
int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
|
||||
checkCudaErrors(cudaProfilerStop());
|
||||
checkCudaErrors(cudaProfilerStart());
|
||||
int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
|
||||
checkCudaErrors(cudaProfilerStop());
|
||||
|
||||
exit(matrix_result);
|
||||
exit(matrix_result);
|
||||
}
|
||||
|
@ -30,11 +30,11 @@
|
||||
|
||||
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA               // Matrix B height
#define WC WB               // Matrix C width
#define HC HA               // Matrix C height
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA               // Matrix B height
#define WC WB               // Matrix C width
#define HC HA               // Matrix C height

#endif // _MATRIXMUL_H_
#endif // _MATRIXMUL_H_
@ -46,23 +46,23 @@
|
||||
|
||||
// includes, system
#include <builtin_types.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project, CUDA
#include <cstring>
#include <cuda.h>
#include <helper_cuda_drvapi.h>
#include <helper_image.h>
#include <helper_string.h>
#include <helper_timer.h>

#include <cstring>
#include <iostream>
#include <string>

#include "matrixMul.h"
@ -71,11 +71,9 @@
|
||||
void runTest(int argc, char **argv);
void randomInit(float *, int);

extern "C" void computeGold(float *, const float *, const float *, unsigned int,
                            unsigned int, unsigned int);
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);

static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
                    int *blk_size);
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size);

#ifndef FATBIN_FILE
#define FATBIN_FILE "matrixMul_kernel64.fatbin"
@ -84,237 +82,252 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Globals
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
CUdevice cuDevice;
|
||||
CUdevice cuDevice;
|
||||
CUcontext cuContext;
|
||||
CUmodule cuModule;
|
||||
size_t totalGlobalMem;
|
||||
CUmodule cuModule;
|
||||
size_t totalGlobalMem;
|
||||
|
||||
const char *sSDKsample = "matrixMulDrv (Driver API)";
|
||||
|
||||
void constantInit(float *data, int size, float val) {
|
||||
for (int i = 0; i < size; ++i) {
|
||||
data[i] = val;
|
||||
}
|
||||
void constantInit(float *data, int size, float val)
|
||||
{
|
||||
for (int i = 0; i < size; ++i) {
|
||||
data[i] = val;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Program main
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
int main(int argc, char **argv) {
|
||||
printf("[ %s ]\n", sSDKsample);
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
printf("[ %s ]\n", sSDKsample);
|
||||
|
||||
runTest(argc, argv);
|
||||
runTest(argc, argv);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Run a simple test for CUDA
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
void runTest(int argc, char **argv) {
|
||||
// initialize CUDA
|
||||
CUfunction matrixMul = NULL;
|
||||
int block_size = 0;
|
||||
void runTest(int argc, char **argv)
|
||||
{
|
||||
// initialize CUDA
|
||||
CUfunction matrixMul = NULL;
|
||||
int block_size = 0;
|
||||
|
||||
initCUDA(argc, argv, &matrixMul, &block_size);
|
||||
initCUDA(argc, argv, &matrixMul, &block_size);
|
||||
|
||||
// set seed for rand()
|
||||
srand(2006);
|
||||
// set seed for rand()
|
||||
srand(2006);
|
||||
|
||||
// allocate host memory for matrices A and B
|
||||
unsigned int size_A = WA * HA;
|
||||
unsigned int mem_size_A = sizeof(float) * size_A;
|
||||
float *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
|
||||
unsigned int size_B = WB * HB;
|
||||
unsigned int mem_size_B = sizeof(float) * size_B;
|
||||
float *h_B = reinterpret_cast<float *>(malloc(mem_size_B));
|
||||
// allocate host memory for matrices A and B
|
||||
unsigned int size_A = WA * HA;
|
||||
unsigned int mem_size_A = sizeof(float) * size_A;
|
||||
float *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
|
||||
unsigned int size_B = WB * HB;
|
||||
unsigned int mem_size_B = sizeof(float) * size_B;
|
||||
float *h_B = reinterpret_cast<float *>(malloc(mem_size_B));
|
||||
|
||||
// initialize host memory
|
||||
const float valB = 0.01f;
|
||||
constantInit(h_A, size_A, 1.0f);
|
||||
constantInit(h_B, size_B, valB);
|
||||
// initialize host memory
|
||||
const float valB = 0.01f;
|
||||
constantInit(h_A, size_A, 1.0f);
|
||||
constantInit(h_B, size_B, valB);
|
||||
|
||||
// allocate device memory
|
||||
CUdeviceptr d_A;
|
||||
checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
|
||||
CUdeviceptr d_B;
|
||||
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
|
||||
// allocate device memory
|
||||
CUdeviceptr d_A;
|
||||
checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
|
||||
CUdeviceptr d_B;
|
||||
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
|
||||
|
||||
// copy host memory to device
|
||||
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
|
||||
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
|
||||
// copy host memory to device
|
||||
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
|
||||
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
|
||||
|
||||
// allocate device memory for result
|
||||
size_t size_C = WC * HC;
|
||||
size_t mem_size_C = sizeof(float) * size_C;
|
||||
// allocate device memory for result
|
||||
size_t size_C = WC * HC;
|
||||
size_t mem_size_C = sizeof(float) * size_C;
|
||||
|
||||
CUdeviceptr d_C;
|
||||
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
|
||||
CUdeviceptr d_C;
|
||||
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
|
||||
|
||||
// allocate mem for the result on host side
|
||||
float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));
|
||||
// allocate mem for the result on host side
|
||||
float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));
|
||||
|
||||
// create and start timer
|
||||
StopWatchInterface *timer = NULL;
|
||||
sdkCreateTimer(&timer);
|
||||
// create and start timer
|
||||
StopWatchInterface *timer = NULL;
|
||||
sdkCreateTimer(&timer);
|
||||
|
||||
// start the timer
|
||||
sdkStartTimer(&timer);
|
||||
// start the timer
|
||||
sdkStartTimer(&timer);
|
||||
|
||||
// There are two ways to launch CUDA kernels via the Driver API.
|
||||
// In this CUDA Sample, we illustrate both ways to pass parameters
|
||||
// and specify parameters. By default we use the simpler method.
|
||||
dim3 block(block_size, block_size, 1);
|
||||
dim3 grid(WC / block_size, HC / block_size, 1);
|
||||
// There are two ways to launch CUDA kernels via the Driver API.
|
||||
// In this CUDA Sample, we illustrate both ways to pass parameters
|
||||
// and specify parameters. By default we use the simpler method.
|
||||
dim3 block(block_size, block_size, 1);
|
||||
dim3 grid(WC / block_size, HC / block_size, 1);
|
||||
|
||||
if (1) {
|
||||
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
|
||||
// Launching (simpler method)
|
||||
size_t Matrix_Width_A = (size_t)WA;
|
||||
size_t Matrix_Width_B = (size_t)WB;
|
||||
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
|
||||
// new CUDA 4.0 Driver API Kernel launch call
|
||||
checkCudaErrors(cuLaunchKernel(
|
||||
matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
|
||||
2 * block_size * block_size * sizeof(float), NULL, args, NULL));
|
||||
} else {
|
||||
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
|
||||
// Launching (advanced method)
|
||||
int offset = 0;
|
||||
char argBuffer[256];
|
||||
|
||||
// pass in launch parameters (not actually de-referencing CUdeviceptr).
|
||||
// CUdeviceptr is storing the value of the parameters
|
||||
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
|
||||
offset += sizeof(d_C);
|
||||
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
|
||||
offset += sizeof(d_A);
|
||||
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
|
||||
offset += sizeof(d_B);
|
||||
|
||||
size_t Matrix_Width_A = (size_t)WA;
|
||||
size_t Matrix_Width_B = (size_t)WB;
|
||||
|
||||
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
|
||||
offset += sizeof(Matrix_Width_A);
|
||||
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
|
||||
offset += sizeof(Matrix_Width_B);
|
||||
|
||||
void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
|
||||
CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
|
||||
CU_LAUNCH_PARAM_END};
|
||||
|
||||
// new CUDA 4.0 Driver API Kernel launch call
|
||||
checkCudaErrors(cuLaunchKernel(
|
||||
matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
|
||||
2 * block_size * block_size * sizeof(float), NULL, NULL,
|
||||
reinterpret_cast<void **>(&kernel_launch_config)));
|
||||
}
|
||||
|
||||
// copy result from device to host
|
||||
checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));
|
||||
|
||||
// stop and destroy timer
|
||||
sdkStopTimer(&timer);
|
||||
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
|
||||
sdkDeleteTimer(&timer);
|
||||
|
||||
printf("Checking computed result for correctness: ");
|
||||
bool correct = true;
|
||||
|
||||
for (int i = 0; i < static_cast<int>(WC * HC); i++) {
|
||||
if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
|
||||
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i,
|
||||
h_C[i], WA * valB);
|
||||
correct = false;
|
||||
if (1) {
|
||||
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
|
||||
// Launching (simpler method)
|
||||
size_t Matrix_Width_A = (size_t)WA;
|
||||
size_t Matrix_Width_B = (size_t)WB;
|
||||
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
|
||||
// new CUDA 4.0 Driver API Kernel launch call
|
||||
checkCudaErrors(cuLaunchKernel(matrixMul,
|
||||
grid.x,
|
||||
grid.y,
|
||||
grid.z,
|
||||
block.x,
|
||||
block.y,
|
||||
block.z,
|
||||
2 * block_size * block_size * sizeof(float),
|
||||
NULL,
|
||||
args,
|
||||
NULL));
|
||||
}
|
||||
}
|
||||
else {
|
||||
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
|
||||
// Launching (advanced method)
|
||||
int offset = 0;
|
||||
char argBuffer[256];
|
||||
|
||||
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
|
||||
// pass in launch parameters (not actually de-referencing CUdeviceptr).
|
||||
// CUdeviceptr is storing the value of the parameters
|
||||
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
|
||||
offset += sizeof(d_C);
|
||||
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
|
||||
offset += sizeof(d_A);
|
||||
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
|
||||
offset += sizeof(d_B);
|
||||
|
||||
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
|
||||
"Results may vary when GPU Boost is enabled.\n");
|
||||
size_t Matrix_Width_A = (size_t)WA;
|
||||
size_t Matrix_Width_B = (size_t)WB;
|
||||
|
||||
// clean up memory
|
||||
free(h_A);
|
||||
free(h_B);
|
||||
free(h_C);
|
||||
checkCudaErrors(cuMemFree(d_A));
|
||||
checkCudaErrors(cuMemFree(d_B));
|
||||
checkCudaErrors(cuMemFree(d_C));
|
||||
checkCudaErrors(cuCtxDestroy(cuContext));
|
||||
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
|
||||
offset += sizeof(Matrix_Width_A);
|
||||
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
|
||||
offset += sizeof(Matrix_Width_B);
|
||||
|
||||
void *kernel_launch_config[5] = {
|
||||
CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};
|
||||
|
||||
// new CUDA 4.0 Driver API Kernel launch call
|
||||
checkCudaErrors(cuLaunchKernel(matrixMul,
|
||||
grid.x,
|
||||
grid.y,
|
||||
grid.z,
|
||||
block.x,
|
||||
block.y,
|
||||
block.z,
|
||||
2 * block_size * block_size * sizeof(float),
|
||||
NULL,
|
||||
NULL,
|
||||
reinterpret_cast<void **>(&kernel_launch_config)));
|
||||
}
|
||||
|
||||
// copy result from device to host
|
||||
checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));
|
||||
|
||||
// stop and destroy timer
|
||||
sdkStopTimer(&timer);
|
||||
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
|
||||
sdkDeleteTimer(&timer);
|
||||
|
||||
printf("Checking computed result for correctness: ");
|
||||
bool correct = true;
|
||||
|
||||
for (int i = 0; i < static_cast<int>(WC * HC); i++) {
|
||||
if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
|
||||
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * valB);
|
||||
correct = false;
|
||||
}
|
||||
}
|
||||
|
||||
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
|
||||
|
||||
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
|
||||
"Results may vary when GPU Boost is enabled.\n");
|
||||
|
||||
// clean up memory
|
||||
free(h_A);
|
||||
free(h_B);
|
||||
free(h_C);
|
||||
checkCudaErrors(cuMemFree(d_A));
|
||||
checkCudaErrors(cuMemFree(d_B));
|
||||
checkCudaErrors(cuMemFree(d_C));
|
||||
checkCudaErrors(cuCtxDestroy(cuContext));
|
||||
}
|
||||
|
||||
// Allocates a matrix with random float entries.
|
||||
void randomInit(float *data, int size) {
|
||||
for (int i = 0; i < size; ++i) {
|
||||
data[i] = rand() / static_cast<float>(RAND_MAX);
|
||||
}
|
||||
}
|
||||
|
||||
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
|
||||
int *blk_size) {
|
||||
CUfunction cuFunction = 0;
|
||||
int major = 0, minor = 0;
|
||||
char deviceName[100];
|
||||
|
||||
cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
|
||||
|
||||
// get compute capabilities and the device name
|
||||
checkCudaErrors(cuDeviceGetAttribute(
|
||||
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
|
||||
checkCudaErrors(cuDeviceGetAttribute(
|
||||
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
|
||||
checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
|
||||
printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
|
||||
|
||||
checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
|
||||
printf(" Total amount of global memory: %llu bytes\n",
|
||||
(long long unsigned int)totalGlobalMem);
|
||||
|
||||
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
|
||||
|
||||
// first search for the module path before we load the results
|
||||
std::string module_path;
|
||||
std::ostringstream fatbin;
|
||||
|
||||
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
|
||||
exit(EXIT_FAILURE);
|
||||
} else {
|
||||
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
|
||||
}
|
||||
|
||||
if (!fatbin.str().size()) {
|
||||
printf("fatbin file empty. exiting..\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Create module from binary file (FATBIN)
|
||||
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
|
||||
|
||||
// select the suitable kernel function
|
||||
const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit",
|
||||
"matrixMul_bs8_64bit"};
|
||||
|
||||
int idx = 0;
|
||||
int block_size = 32;
|
||||
while (idx < 3) {
|
||||
int threadsPerBlock = 0;
|
||||
int blocksPerGrid = 0;
|
||||
|
||||
checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
|
||||
checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
|
||||
&blocksPerGrid, &threadsPerBlock, cuFunction, 0,
|
||||
2 * block_size * block_size * sizeof(float), 0));
|
||||
if (block_size * block_size <= threadsPerBlock) {
|
||||
printf("> %d block size selected\n", block_size);
|
||||
break;
|
||||
} else {
|
||||
block_size /= 2;
|
||||
void randomInit(float *data, int size)
|
||||
{
|
||||
for (int i = 0; i < size; ++i) {
|
||||
data[i] = rand() / static_cast<float>(RAND_MAX);
|
||||
}
|
||||
idx++;
|
||||
}
|
||||
|
||||
*pMatrixMul = cuFunction;
|
||||
*blk_size = block_size;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size)
|
||||
{
|
||||
CUfunction cuFunction = 0;
|
||||
int major = 0, minor = 0;
|
||||
char deviceName[100];
|
||||
|
||||
cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
|
||||
|
||||
// get compute capabilities and the device name
|
||||
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
|
||||
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
|
||||
checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
|
||||
printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
|
||||
|
||||
checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
|
||||
printf(" Total amount of global memory: %llu bytes\n", (long long unsigned int)totalGlobalMem);
|
||||
|
||||
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
|
||||
|
||||
// first search for the module path before we load the results
|
||||
std::string module_path;
|
||||
std::ostringstream fatbin;
|
||||
|
||||
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
else {
|
||||
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
|
||||
}
|
||||
|
||||
if (!fatbin.str().size()) {
|
||||
printf("fatbin file empty. exiting..\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Create module from binary file (FATBIN)
|
||||
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
|
||||
|
||||
// select the suitable kernel function
|
||||
const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit", "matrixMul_bs8_64bit"};
|
||||
|
||||
int idx = 0;
|
||||
int block_size = 32;
|
||||
while (idx < 3) {
|
||||
int threadsPerBlock = 0;
|
||||
int blocksPerGrid = 0;
|
||||
|
||||
checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
|
||||
checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
|
||||
&blocksPerGrid, &threadsPerBlock, cuFunction, 0, 2 * block_size * block_size * sizeof(float), 0));
|
||||
if (block_size * block_size <= threadsPerBlock) {
|
||||
printf("> %d block size selected\n", block_size);
|
||||
break;
|
||||
}
|
||||
else {
|
||||
block_size /= 2;
|
||||
}
|
||||
idx++;
|
||||
}
|
||||
|
||||
*pMatrixMul = cuFunction;
|
||||
*blk_size = block_size;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -42,86 +42,87 @@
|
||||
//! wA is A's width and wB is B's width
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template <int block_size, typename size_type>
|
||||
__device__ void matrixMul(float *C, float *A, float *B, size_type wA,
|
||||
size_type wB) {
|
||||
// Block index
|
||||
size_type bx = blockIdx.x;
|
||||
size_type by = blockIdx.y;
|
||||
__device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB)
|
||||
{
|
||||
// Block index
|
||||
size_type bx = blockIdx.x;
|
||||
size_type by = blockIdx.y;
|
||||
|
||||
// Thread index
|
||||
size_type tx = threadIdx.x;
|
||||
size_type ty = threadIdx.y;
|
||||
// Thread index
|
||||
size_type tx = threadIdx.x;
|
||||
size_type ty = threadIdx.y;
|
||||
|
||||
// Index of the first sub-matrix of A processed by the block
|
||||
size_type aBegin = wA * block_size * by;
|
||||
// Index of the first sub-matrix of A processed by the block
|
||||
size_type aBegin = wA * block_size * by;
|
||||
|
||||
// Index of the last sub-matrix of A processed by the block
|
||||
size_type aEnd = aBegin + wA - 1;
|
||||
// Index of the last sub-matrix of A processed by the block
|
||||
size_type aEnd = aBegin + wA - 1;
|
||||
|
||||
// Step size used to iterate through the sub-matrices of A
|
||||
size_type aStep = block_size;
|
||||
// Step size used to iterate through the sub-matrices of A
|
||||
size_type aStep = block_size;
|
||||
|
||||
// Index of the first sub-matrix of B processed by the block
|
||||
size_type bBegin = block_size * bx;
|
||||
// Index of the first sub-matrix of B processed by the block
|
||||
size_type bBegin = block_size * bx;
|
||||
|
||||
// Step size used to iterate through the sub-matrices of B
|
||||
size_type bStep = block_size * wB;
|
||||
// Step size used to iterate through the sub-matrices of B
|
||||
size_type bStep = block_size * wB;
|
||||
|
||||
// Csub is used to store the element of the block sub-matrix
|
||||
// that is computed by the thread
|
||||
float Csub = 0;
|
||||
// Csub is used to store the element of the block sub-matrix
|
||||
// that is computed by the thread
|
||||
float Csub = 0;
|
||||
|
||||
// Loop over all the sub-matrices of A and B
|
||||
// required to compute the block sub-matrix
|
||||
for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
|
||||
// Declaration of the shared memory array As used to
|
||||
// store the sub-matrix of A
|
||||
__shared__ float As[block_size][block_size];
|
||||
// Loop over all the sub-matrices of A and B
|
||||
// required to compute the block sub-matrix
|
||||
for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
|
||||
// Declaration of the shared memory array As used to
|
||||
// store the sub-matrix of A
|
||||
__shared__ float As[block_size][block_size];
|
||||
|
||||
// Declaration of the shared memory array Bs used to
|
||||
// store the sub-matrix of B
|
||||
__shared__ float Bs[block_size][block_size];
|
||||
// Declaration of the shared memory array Bs used to
|
||||
// store the sub-matrix of B
|
||||
__shared__ float Bs[block_size][block_size];
|
||||
|
||||
// Load the matrices from device memory
|
||||
// to shared memory; each thread loads
|
||||
// one element of each matrix
|
||||
AS(ty, tx) = A[a + wA * ty + tx];
|
||||
BS(ty, tx) = B[b + wB * ty + tx];
|
||||
// Load the matrices from device memory
|
||||
// to shared memory; each thread loads
|
||||
// one element of each matrix
|
||||
AS(ty, tx) = A[a + wA * ty + tx];
|
||||
BS(ty, tx) = B[b + wB * ty + tx];
|
||||
|
||||
// Synchronize to make sure the matrices are loaded
|
||||
__syncthreads();
|
||||
// Synchronize to make sure the matrices are loaded
|
||||
__syncthreads();
|
||||
|
||||
// Multiply the two matrices together;
|
||||
// each thread computes one element
|
||||
// of the block sub-matrix
|
||||
// Multiply the two matrices together;
|
||||
// each thread computes one element
|
||||
// of the block sub-matrix
|
||||
#pragma unroll
|
||||
|
||||
for (size_type k = 0; k < block_size; ++k) Csub += AS(ty, k) * BS(k, tx);
|
||||
for (size_type k = 0; k < block_size; ++k)
|
||||
Csub += AS(ty, k) * BS(k, tx);
|
||||
|
||||
// Synchronize to make sure that the preceding
|
||||
// computation is done before loading two new
|
||||
// sub-matrices of A and B in the next iteration
|
||||
__syncthreads();
|
||||
}
|
||||
// Synchronize to make sure that the preceding
|
||||
// computation is done before loading two new
|
||||
// sub-matrices of A and B in the next iteration
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// Write the block sub-matrix to device memory;
|
||||
// each thread writes one element
|
||||
size_type c = wB * block_size * by + block_size * bx;
|
||||
C[c + wB * ty + tx] = Csub;
|
||||
// Write the block sub-matrix to device memory;
|
||||
// each thread writes one element
|
||||
size_type c = wB * block_size * by + block_size * bx;
|
||||
C[c + wB * ty + tx] = Csub;
|
||||
}
|
||||
|
||||
// C wrappers around our template kernel
|
||||
extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B,
|
||||
size_t wA, size_t wB) {
|
||||
matrixMul<8, size_t>(C, A, B, wA, wB);
|
||||
extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
|
||||
{
|
||||
matrixMul<8, size_t>(C, A, B, wA, wB);
|
||||
}
|
||||
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B,
|
||||
size_t wA, size_t wB) {
|
||||
matrixMul<16, size_t>(C, A, B, wA, wB);
|
||||
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
|
||||
{
|
||||
matrixMul<16, size_t>(C, A, B, wA, wB);
|
||||
}
|
||||
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B,
|
||||
size_t wA, size_t wB) {
|
||||
matrixMul<32, size_t>(C, A, B, wA, wB);
|
||||
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
|
||||
{
|
||||
matrixMul<32, size_t>(C, A, B, wA, wB);
|
||||
}
|
||||
|
||||
#endif // #ifndef _MATRIXMUL_KERNEL_H_
|
||||
#endif // #ifndef _MATRIXMUL_KERNEL_H_
|
||||
|
@ -15,210 +15,211 @@
|
||||
|
||||
// With these flags defined, this source file will dynamically
// load the corresponding functions. Disabled by default.
//#define CUDA_INIT_D3D9
//#define CUDA_INIT_D3D10
//#define CUDA_INIT_D3D11
//#define CUDA_INIT_OPENGL
// #define CUDA_INIT_D3D9
// #define CUDA_INIT_D3D10
// #define CUDA_INIT_D3D11
// #define CUDA_INIT_OPENGL

#include <stdio.h>
#include "cuda_drvapi_dynlink.h"
tcuInit *_cuInit;
|
||||
tcuDriverGetVersion *cuDriverGetVersion;
|
||||
tcuDeviceGet *cuDeviceGet;
|
||||
tcuDeviceGetCount *cuDeviceGetCount;
|
||||
tcuDeviceGetName *cuDeviceGetName;
|
||||
tcuDeviceComputeCapability *cuDeviceComputeCapability;
|
||||
tcuDeviceTotalMem *cuDeviceTotalMem;
|
||||
tcuDeviceGetProperties *cuDeviceGetProperties;
|
||||
tcuDeviceGetAttribute *cuDeviceGetAttribute;
|
||||
tcuGetErrorString *cuGetErrorString;
|
||||
tcuCtxCreate *cuCtxCreate;
|
||||
tcuCtxDestroy *cuCtxDestroy;
|
||||
tcuCtxAttach *cuCtxAttach;
|
||||
tcuCtxDetach *cuCtxDetach;
|
||||
tcuCtxPushCurrent *cuCtxPushCurrent;
|
||||
tcuCtxPopCurrent *cuCtxPopCurrent;
|
||||
tcuCtxGetCurrent *cuCtxGetCurrent;
|
||||
tcuCtxSetCurrent *cuCtxSetCurrent;
|
||||
tcuCtxGetDevice *cuCtxGetDevice;
|
||||
tcuCtxSynchronize *cuCtxSynchronize;
|
||||
tcuModuleLoad *cuModuleLoad;
|
||||
tcuModuleLoadData *cuModuleLoadData;
|
||||
tcuModuleLoadDataEx *cuModuleLoadDataEx;
|
||||
tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
|
||||
tcuModuleUnload *cuModuleUnload;
|
||||
tcuModuleGetFunction *cuModuleGetFunction;
|
||||
tcuModuleGetGlobal *cuModuleGetGlobal;
|
||||
tcuModuleGetTexRef *cuModuleGetTexRef;
|
||||
tcuModuleGetSurfRef *cuModuleGetSurfRef;
|
||||
tcuMemGetInfo *cuMemGetInfo;
|
||||
tcuMemAlloc *cuMemAlloc;
|
||||
tcuMemAllocPitch *cuMemAllocPitch;
|
||||
tcuMemFree *cuMemFree;
|
||||
tcuMemGetAddressRange *cuMemGetAddressRange;
|
||||
tcuMemAllocHost *cuMemAllocHost;
|
||||
tcuMemFreeHost *cuMemFreeHost;
|
||||
tcuMemHostAlloc *cuMemHostAlloc;
|
||||
tcuMemHostGetFlags *cuMemHostGetFlags;
|
||||
#include <stdio.h>
|
||||
|
||||
tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
|
||||
tcuDeviceGetByPCIBusId *cuDeviceGetByPCIBusId;
|
||||
tcuDeviceGetPCIBusId *cuDeviceGetPCIBusId;
|
||||
tcuIpcGetEventHandle *cuIpcGetEventHandle;
|
||||
tcuIpcOpenEventHandle *cuIpcOpenEventHandle;
|
||||
tcuIpcGetMemHandle *cuIpcGetMemHandle;
|
||||
tcuIpcOpenMemHandle *cuIpcOpenMemHandle;
|
||||
tcuIpcCloseMemHandle *cuIpcCloseMemHandle;
|
||||
tcuInit *_cuInit;
|
||||
tcuDriverGetVersion *cuDriverGetVersion;
|
||||
tcuDeviceGet *cuDeviceGet;
|
||||
tcuDeviceGetCount *cuDeviceGetCount;
|
||||
tcuDeviceGetName *cuDeviceGetName;
|
||||
tcuDeviceComputeCapability *cuDeviceComputeCapability;
|
||||
tcuDeviceTotalMem *cuDeviceTotalMem;
|
||||
tcuDeviceGetProperties *cuDeviceGetProperties;
|
||||
tcuDeviceGetAttribute *cuDeviceGetAttribute;
|
||||
tcuGetErrorString *cuGetErrorString;
|
||||
tcuCtxCreate *cuCtxCreate;
|
||||
tcuCtxDestroy *cuCtxDestroy;
|
||||
tcuCtxAttach *cuCtxAttach;
|
||||
tcuCtxDetach *cuCtxDetach;
|
||||
tcuCtxPushCurrent *cuCtxPushCurrent;
|
||||
tcuCtxPopCurrent *cuCtxPopCurrent;
|
||||
tcuCtxGetCurrent *cuCtxGetCurrent;
|
||||
tcuCtxSetCurrent *cuCtxSetCurrent;
|
||||
tcuCtxGetDevice *cuCtxGetDevice;
|
||||
tcuCtxSynchronize *cuCtxSynchronize;
|
||||
tcuModuleLoad *cuModuleLoad;
|
||||
tcuModuleLoadData *cuModuleLoadData;
|
||||
tcuModuleLoadDataEx *cuModuleLoadDataEx;
|
||||
tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
|
||||
tcuModuleUnload *cuModuleUnload;
|
||||
tcuModuleGetFunction *cuModuleGetFunction;
|
||||
tcuModuleGetGlobal *cuModuleGetGlobal;
|
||||
tcuModuleGetTexRef *cuModuleGetTexRef;
|
||||
tcuModuleGetSurfRef *cuModuleGetSurfRef;
|
||||
tcuMemGetInfo *cuMemGetInfo;
|
||||
tcuMemAlloc *cuMemAlloc;
|
||||
tcuMemAllocPitch *cuMemAllocPitch;
|
||||
tcuMemFree *cuMemFree;
|
||||
tcuMemGetAddressRange *cuMemGetAddressRange;
|
||||
tcuMemAllocHost *cuMemAllocHost;
|
||||
tcuMemFreeHost *cuMemFreeHost;
|
||||
tcuMemHostAlloc *cuMemHostAlloc;
|
||||
tcuMemHostGetFlags *cuMemHostGetFlags;
|
||||
|
||||
tcuMemHostRegister *cuMemHostRegister;
|
||||
tcuMemHostUnregister *cuMemHostUnregister;
|
||||
tcuMemcpyHtoD *cuMemcpyHtoD;
|
||||
tcuMemcpyDtoH *cuMemcpyDtoH;
|
||||
tcuMemcpyDtoD *cuMemcpyDtoD;
|
||||
tcuMemcpyDtoA *cuMemcpyDtoA;
|
||||
tcuMemcpyAtoD *cuMemcpyAtoD;
|
||||
tcuMemcpyHtoA *cuMemcpyHtoA;
|
||||
tcuMemcpyAtoH *cuMemcpyAtoH;
|
||||
tcuMemcpyAtoA *cuMemcpyAtoA;
|
||||
tcuMemcpy2D *cuMemcpy2D;
|
||||
tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
|
||||
tcuMemcpy3D *cuMemcpy3D;
|
||||
tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
|
||||
tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
|
||||
tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
|
||||
tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
|
||||
tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
|
||||
tcuMemcpy2DAsync *cuMemcpy2DAsync;
|
||||
tcuMemcpy3DAsync *cuMemcpy3DAsync;
|
||||
tcuMemcpy *cuMemcpy;
|
||||
tcuMemcpyPeer *cuMemcpyPeer;
|
||||
tcuMemsetD8 *cuMemsetD8;
|
||||
tcuMemsetD16 *cuMemsetD16;
|
||||
tcuMemsetD32 *cuMemsetD32;
|
||||
tcuMemsetD2D8 *cuMemsetD2D8;
|
||||
tcuMemsetD2D16 *cuMemsetD2D16;
|
||||
tcuMemsetD2D32 *cuMemsetD2D32;
|
||||
tcuFuncSetBlockShape *cuFuncSetBlockShape;
|
||||
tcuFuncSetSharedSize *cuFuncSetSharedSize;
|
||||
tcuFuncGetAttribute *cuFuncGetAttribute;
|
||||
tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
|
||||
tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
|
||||
tcuLaunchKernel *cuLaunchKernel;
|
||||
tcuArrayCreate *cuArrayCreate;
|
||||
tcuArrayGetDescriptor *cuArrayGetDescriptor;
|
||||
tcuArrayDestroy *cuArrayDestroy;
|
||||
tcuArray3DCreate *cuArray3DCreate;
|
||||
tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
|
||||
tcuTexRefCreate *cuTexRefCreate;
|
||||
tcuTexRefDestroy *cuTexRefDestroy;
|
||||
tcuTexRefSetArray *cuTexRefSetArray;
|
||||
tcuTexRefSetAddress *cuTexRefSetAddress;
|
||||
tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
|
||||
tcuTexRefSetFormat *cuTexRefSetFormat;
|
||||
tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
|
||||
tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
|
||||
tcuTexRefSetFlags *cuTexRefSetFlags;
|
||||
tcuTexRefGetAddress *cuTexRefGetAddress;
|
||||
tcuTexRefGetArray *cuTexRefGetArray;
|
||||
tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
|
||||
tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
|
||||
tcuTexRefGetFormat *cuTexRefGetFormat;
|
||||
tcuTexRefGetFlags *cuTexRefGetFlags;
|
||||
tcuSurfRefSetArray *cuSurfRefSetArray;
|
||||
tcuSurfRefGetArray *cuSurfRefGetArray;
|
||||
tcuParamSetSize *cuParamSetSize;
|
||||
tcuParamSeti *cuParamSeti;
|
||||
tcuParamSetf *cuParamSetf;
|
||||
tcuParamSetv *cuParamSetv;
|
||||
tcuParamSetTexRef *cuParamSetTexRef;
|
||||
tcuLaunch *cuLaunch;
|
||||
tcuLaunchGrid *cuLaunchGrid;
|
||||
tcuLaunchGridAsync *cuLaunchGridAsync;
|
||||
tcuEventCreate *cuEventCreate;
|
||||
tcuEventRecord *cuEventRecord;
|
||||
tcuEventQuery *cuEventQuery;
|
||||
tcuEventSynchronize *cuEventSynchronize;
|
||||
tcuEventDestroy *cuEventDestroy;
|
||||
tcuEventElapsedTime *cuEventElapsedTime;
|
||||
tcuStreamCreate *cuStreamCreate;
|
||||
tcuStreamWaitEvent *cuStreamWaitEvent;
|
||||
tcuStreamAddCallback *cuStreamAddCallback;
|
||||
tcuStreamQuery *cuStreamQuery;
|
||||
tcuStreamSynchronize *cuStreamSynchronize;
|
||||
tcuStreamDestroy *cuStreamDestroy;
|
||||
tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
|
||||
tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
|
||||
tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
|
||||
tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
|
||||
tcuGraphicsMapResources *cuGraphicsMapResources;
|
||||
tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
|
||||
tcuGetExportTable *cuGetExportTable;
|
||||
tcuCtxSetLimit *cuCtxSetLimit;
|
||||
tcuCtxGetLimit *cuCtxGetLimit;
|
||||
tcuCtxGetCacheConfig *cuCtxGetCacheConfig;
|
||||
tcuCtxSetCacheConfig *cuCtxSetCacheConfig;
|
||||
tcuCtxGetSharedMemConfig *cuCtxGetSharedMemConfig;
|
||||
tcuCtxSetSharedMemConfig *cuCtxSetSharedMemConfig;
|
||||
tcuCtxGetApiVersion *cuCtxGetApiVersion;
|
||||
tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
|
||||
tcuDeviceGetByPCIBusId *cuDeviceGetByPCIBusId;
|
||||
tcuDeviceGetPCIBusId *cuDeviceGetPCIBusId;
|
||||
tcuIpcGetEventHandle *cuIpcGetEventHandle;
|
||||
tcuIpcOpenEventHandle *cuIpcOpenEventHandle;
|
||||
tcuIpcGetMemHandle *cuIpcGetMemHandle;
|
||||
tcuIpcOpenMemHandle *cuIpcOpenMemHandle;
|
||||
tcuIpcCloseMemHandle *cuIpcCloseMemHandle;
|
||||
|
||||
tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
|
||||
tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
|
||||
tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;
|
||||
tcuMemHostRegister *cuMemHostRegister;
|
||||
tcuMemHostUnregister *cuMemHostUnregister;
|
||||
tcuMemcpyHtoD *cuMemcpyHtoD;
|
||||
tcuMemcpyDtoH *cuMemcpyDtoH;
|
||||
tcuMemcpyDtoD *cuMemcpyDtoD;
|
||||
tcuMemcpyDtoA *cuMemcpyDtoA;
|
||||
tcuMemcpyAtoD *cuMemcpyAtoD;
|
||||
tcuMemcpyHtoA *cuMemcpyHtoA;
|
||||
tcuMemcpyAtoH *cuMemcpyAtoH;
|
||||
tcuMemcpyAtoA *cuMemcpyAtoA;
|
||||
tcuMemcpy2D *cuMemcpy2D;
|
||||
tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
|
||||
tcuMemcpy3D *cuMemcpy3D;
|
||||
tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
|
||||
tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
|
||||
tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
|
||||
tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
|
||||
tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
|
||||
tcuMemcpy2DAsync *cuMemcpy2DAsync;
|
||||
tcuMemcpy3DAsync *cuMemcpy3DAsync;
|
||||
tcuMemcpy *cuMemcpy;
|
||||
tcuMemcpyPeer *cuMemcpyPeer;
|
||||
tcuMemsetD8 *cuMemsetD8;
|
||||
tcuMemsetD16 *cuMemsetD16;
|
||||
tcuMemsetD32 *cuMemsetD32;
|
||||
tcuMemsetD2D8 *cuMemsetD2D8;
|
||||
tcuMemsetD2D16 *cuMemsetD2D16;
|
||||
tcuMemsetD2D32 *cuMemsetD2D32;
|
||||
tcuFuncSetBlockShape *cuFuncSetBlockShape;
|
||||
tcuFuncSetSharedSize *cuFuncSetSharedSize;
|
||||
tcuFuncGetAttribute *cuFuncGetAttribute;
|
||||
tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
|
||||
tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
|
||||
tcuLaunchKernel *cuLaunchKernel;
|
||||
tcuArrayCreate *cuArrayCreate;
|
||||
tcuArrayGetDescriptor *cuArrayGetDescriptor;
|
||||
tcuArrayDestroy *cuArrayDestroy;
|
||||
tcuArray3DCreate *cuArray3DCreate;
|
||||
tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
|
||||
tcuTexRefCreate *cuTexRefCreate;
|
||||
tcuTexRefDestroy *cuTexRefDestroy;
|
||||
tcuTexRefSetArray *cuTexRefSetArray;
|
||||
tcuTexRefSetAddress *cuTexRefSetAddress;
|
||||
tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
|
||||
tcuTexRefSetFormat *cuTexRefSetFormat;
|
||||
tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
|
||||
tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
|
||||
tcuTexRefSetFlags *cuTexRefSetFlags;
|
||||
tcuTexRefGetAddress *cuTexRefGetAddress;
|
||||
tcuTexRefGetArray *cuTexRefGetArray;
|
||||
tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
|
||||
tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
|
||||
tcuTexRefGetFormat *cuTexRefGetFormat;
|
||||
tcuTexRefGetFlags *cuTexRefGetFlags;
|
||||
tcuSurfRefSetArray *cuSurfRefSetArray;
|
||||
tcuSurfRefGetArray *cuSurfRefGetArray;
|
||||
tcuParamSetSize *cuParamSetSize;
|
||||
tcuParamSeti *cuParamSeti;
|
||||
tcuParamSetf *cuParamSetf;
|
||||
tcuParamSetv *cuParamSetv;
|
||||
tcuParamSetTexRef *cuParamSetTexRef;
|
||||
tcuLaunch *cuLaunch;
|
||||
tcuLaunchGrid *cuLaunchGrid;
|
||||
tcuLaunchGridAsync *cuLaunchGridAsync;
|
||||
tcuEventCreate *cuEventCreate;
|
||||
tcuEventRecord *cuEventRecord;
|
||||
tcuEventQuery *cuEventQuery;
|
||||
tcuEventSynchronize *cuEventSynchronize;
|
||||
tcuEventDestroy *cuEventDestroy;
|
||||
tcuEventElapsedTime *cuEventElapsedTime;
|
||||
tcuStreamCreate *cuStreamCreate;
|
||||
tcuStreamWaitEvent *cuStreamWaitEvent;
|
||||
tcuStreamAddCallback *cuStreamAddCallback;
|
||||
tcuStreamQuery *cuStreamQuery;
|
||||
tcuStreamSynchronize *cuStreamSynchronize;
|
||||
tcuStreamDestroy *cuStreamDestroy;
|
||||
tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
|
||||
tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
|
||||
tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
|
||||
tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
|
||||
tcuGraphicsMapResources *cuGraphicsMapResources;
|
||||
tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
|
||||
tcuGetExportTable *cuGetExportTable;
|
||||
tcuCtxSetLimit *cuCtxSetLimit;
|
||||
tcuCtxGetLimit *cuCtxGetLimit;
|
||||
tcuCtxGetCacheConfig *cuCtxGetCacheConfig;
|
||||
tcuCtxSetCacheConfig *cuCtxSetCacheConfig;
|
||||
tcuCtxGetSharedMemConfig *cuCtxGetSharedMemConfig;
|
||||
tcuCtxSetSharedMemConfig *cuCtxSetSharedMemConfig;
|
||||
tcuCtxGetApiVersion *cuCtxGetApiVersion;
|
||||
|
||||
tcuProfilerStop *cuProfilerStop;
|
||||
tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
|
||||
tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
|
||||
tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;
|
||||
|
||||
tcuProfilerStop *cuProfilerStop;
|
||||
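All of the globals above are function pointers, not the driver entry points themselves: each tcuXxx type is a typedef declared in cuda_drvapi_dynlink.h, and cuInit() further down fills the pointers in from the driver library. A reduced sketch of the pattern, using a hypothetical name so it does not collide with the real declarations:

#include <cuda.h>   // only for CUresult/CUdevice in this sketch

// Shape of one entry: a typedef for the signature, plus a global pointer that
// stays NULL until the loader below resolves the symbol from libcuda/nvcuda.
typedef CUresult (*tcuDeviceGet_sketch)(CUdevice *device, int ordinal);
static tcuDeviceGet_sketch cuDeviceGet_sketch = NULL;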
|
||||
#ifdef CUDA_INIT_D3D9
|
||||
// D3D9/CUDA interop (CUDA 1.x compatible API). These functions
|
||||
// are deprecated; please use the ones below
|
||||
tcuD3D9Begin *cuD3D9Begin;
|
||||
tcuD3D9End *cuD3DEnd;
|
||||
tcuD3D9RegisterVertexBuffer *cuD3D9RegisterVertexBuffer;
|
||||
tcuD3D9MapVertexBuffer *cuD3D9MapVertexBuffer;
|
||||
tcuD3D9UnmapVertexBuffer *cuD3D9UnmapVertexBuffer;
|
||||
tcuD3D9UnregisterVertexBuffer *cuD3D9UnregisterVertexBuffer;
|
||||
tcuD3D9Begin *cuD3D9Begin;
|
||||
tcuD3D9End *cuD3DEnd;
|
||||
tcuD3D9RegisterVertexBuffer *cuD3D9RegisterVertexBuffer;
|
||||
tcuD3D9MapVertexBuffer *cuD3D9MapVertexBuffer;
|
||||
tcuD3D9UnmapVertexBuffer *cuD3D9UnmapVertexBuffer;
|
||||
tcuD3D9UnregisterVertexBuffer *cuD3D9UnregisterVertexBuffer;
|
||||
|
||||
// D3D9/CUDA interop (CUDA 2.x compatible)
|
||||
tcuD3D9GetDirect3DDevice *cuD3D9GetDirect3DDevice;
|
||||
tcuD3D9RegisterResource *cuD3D9RegisterResource;
|
||||
tcuD3D9UnregisterResource *cuD3D9UnregisterResource;
|
||||
tcuD3D9MapResources *cuD3D9MapResources;
|
||||
tcuD3D9UnmapResources *cuD3D9UnmapResources;
|
||||
tcuD3D9ResourceSetMapFlags *cuD3D9ResourceSetMapFlags;
|
||||
tcuD3D9ResourceGetSurfaceDimensions *cuD3D9ResourceGetSurfaceDimensions;
|
||||
tcuD3D9ResourceGetMappedArray *cuD3D9ResourceGetMappedArray;
|
||||
tcuD3D9ResourceGetMappedPointer *cuD3D9ResourceGetMappedPointer;
|
||||
tcuD3D9ResourceGetMappedSize *cuD3D9ResourceGetMappedSize;
|
||||
tcuD3D9ResourceGetMappedPitch *cuD3D9ResourceGetMappedPitch;
|
||||
tcuD3D9GetDirect3DDevice *cuD3D9GetDirect3DDevice;
|
||||
tcuD3D9RegisterResource *cuD3D9RegisterResource;
|
||||
tcuD3D9UnregisterResource *cuD3D9UnregisterResource;
|
||||
tcuD3D9MapResources *cuD3D9MapResources;
|
||||
tcuD3D9UnmapResources *cuD3D9UnmapResources;
|
||||
tcuD3D9ResourceSetMapFlags *cuD3D9ResourceSetMapFlags;
|
||||
tcuD3D9ResourceGetSurfaceDimensions *cuD3D9ResourceGetSurfaceDimensions;
|
||||
tcuD3D9ResourceGetMappedArray *cuD3D9ResourceGetMappedArray;
|
||||
tcuD3D9ResourceGetMappedPointer *cuD3D9ResourceGetMappedPointer;
|
||||
tcuD3D9ResourceGetMappedSize *cuD3D9ResourceGetMappedSize;
|
||||
tcuD3D9ResourceGetMappedPitch *cuD3D9ResourceGetMappedPitch;
|
||||
|
||||
// D3D9/CUDA interop (CUDA 2.0+)
|
||||
tcuD3D9GetDevice *cuD3D9GetDevice;
|
||||
tcuD3D9CtxCreate *cuD3D9CtxCreate;
|
||||
tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource;
|
||||
tcuD3D9GetDevice *cuD3D9GetDevice;
|
||||
tcuD3D9CtxCreate *cuD3D9CtxCreate;
|
||||
tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource;
|
||||
#endif
|
||||
|
||||
#ifdef CUDA_INIT_D3D10
|
||||
// D3D10/CUDA interop (CUDA 3.0+)
|
||||
tcuD3D10GetDevice *cuD3D10GetDevice;
|
||||
tcuD3D10CtxCreate *cuD3D10CtxCreate;
|
||||
tcuGraphicsD3D10RegisterResource *cuGraphicsD3D10RegisterResource;
|
||||
tcuD3D10GetDevice *cuD3D10GetDevice;
|
||||
tcuD3D10CtxCreate *cuD3D10CtxCreate;
|
||||
tcuGraphicsD3D10RegisterResource *cuGraphicsD3D10RegisterResource;
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef CUDA_INIT_D3D11
|
||||
// D3D11/CUDA interop (CUDA 3.0+)
|
||||
tcuD3D11GetDevice *cuD3D11GetDevice;
|
||||
tcuD3D11CtxCreate *cuD3D11CtxCreate;
|
||||
tcuGraphicsD3D11RegisterResource *cuGraphicsD3D11RegisterResource;
|
||||
tcuD3D11GetDevice *cuD3D11GetDevice;
|
||||
tcuD3D11CtxCreate *cuD3D11CtxCreate;
|
||||
tcuGraphicsD3D11RegisterResource *cuGraphicsD3D11RegisterResource;
|
||||
#endif
|
||||
|
||||
// GL/CUDA interop
|
||||
#ifdef CUDA_INIT_OPENGL
|
||||
tcuGLCtxCreate *cuGLCtxCreate;
|
||||
tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
|
||||
tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
|
||||
tcuGLCtxCreate *cuGLCtxCreate;
|
||||
tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
|
||||
tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
|
||||
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
|
||||
tcuWGLGetDevice *cuWGLGetDevice;
|
||||
tcuWGLGetDevice *cuWGLGetDevice;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -239,8 +240,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
|
||||
{
|
||||
*pInstance = LoadLibrary(__CudaLibName);
|
||||
|
||||
if (*pInstance == NULL)
|
||||
{
|
||||
if (*pInstance == NULL) {
|
||||
printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
|
||||
return CUDA_ERROR_UNKNOWN;
|
||||
}
|
||||
@ -248,38 +248,35 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
|
||||
return CUDA_SUCCESS;
|
||||
}
|
||||
|
||||
#define GET_PROC_EX(name, alias, required) \
|
||||
alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \
|
||||
if (alias == NULL && required) { \
|
||||
printf("Failed to find required function \"%s\" in %s\n", \
|
||||
#name, __CudaLibName); \
|
||||
return CUDA_ERROR_UNKNOWN; \
|
||||
#define GET_PROC_EX(name, alias, required) \
|
||||
alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \
|
||||
if (alias == NULL && required) { \
|
||||
printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
|
||||
return CUDA_ERROR_UNKNOWN; \
|
||||
}
|
||||
|
||||
#define GET_PROC_EX_V2(name, alias, required) \
|
||||
alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));\
|
||||
if (alias == NULL && required) { \
|
||||
printf("Failed to find required function \"%s\" in %s\n", \
|
||||
STRINGIFY(name##_v2), __CudaLibName); \
|
||||
return CUDA_ERROR_UNKNOWN; \
|
||||
#define GET_PROC_EX_V2(name, alias, required) \
|
||||
alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2)); \
|
||||
if (alias == NULL && required) { \
|
||||
printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
|
||||
return CUDA_ERROR_UNKNOWN; \
|
||||
}
|
||||
|
||||
#define GET_PROC_EX_V3(name, alias, required) \
|
||||
alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));\
|
||||
if (alias == NULL && required) { \
|
||||
printf("Failed to find required function \"%s\" in %s\n", \
|
||||
STRINGIFY(name##_v3), __CudaLibName); \
|
||||
return CUDA_ERROR_UNKNOWN; \
|
||||
#define GET_PROC_EX_V3(name, alias, required) \
|
||||
alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3)); \
|
||||
if (alias == NULL && required) { \
|
||||
printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
|
||||
return CUDA_ERROR_UNKNOWN; \
|
||||
}
|
||||
|
||||
#elif defined(__unix__) || defined (__QNX__) || defined(__APPLE__) || defined(__MACOSX)
|
||||
#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || defined(__MACOSX)
|
||||
|
||||
#include <dlfcn.h>
|
||||
|
||||
#if defined(__APPLE__) || defined(__MACOSX)
|
||||
static char __CudaLibName[] = "/usr/local/cuda/lib/libcuda.dylib";
|
||||
#elif defined(__ANDROID__)
|
||||
#if defined (__aarch64__)
|
||||
#if defined(__aarch64__)
|
||||
static char __CudaLibName[] = "/system/vendor/lib64/libcuda.so";
|
||||
#elif defined(__arm__)
|
||||
static char __CudaLibName[] = "/system/vendor/lib/libcuda.so";
|
||||
@ -294,8 +291,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
|
||||
{
|
||||
*pInstance = dlopen(__CudaLibName, RTLD_NOW);
|
||||
|
||||
if (*pInstance == NULL)
|
||||
{
|
||||
if (*pInstance == NULL) {
|
||||
printf("dlopen \"%s\" failed!\n", __CudaLibName);
|
||||
return CUDA_ERROR_UNKNOWN;
|
||||
}
|
||||
@ -303,52 +299,49 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
|
||||
return CUDA_SUCCESS;
|
||||
}
|
||||
|
||||
#define GET_PROC_EX(name, alias, required) \
|
||||
alias = (t##name *)dlsym(CudaDrvLib, #name); \
|
||||
if (alias == NULL && required) { \
|
||||
printf("Failed to find required function \"%s\" in %s\n", \
|
||||
#name, __CudaLibName); \
|
||||
return CUDA_ERROR_UNKNOWN; \
|
||||
#define GET_PROC_EX(name, alias, required) \
|
||||
alias = (t##name *)dlsym(CudaDrvLib, #name); \
|
||||
if (alias == NULL && required) { \
|
||||
printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
|
||||
return CUDA_ERROR_UNKNOWN; \
|
||||
}
|
||||
|
||||
#define GET_PROC_EX_V2(name, alias, required) \
|
||||
alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2)); \
|
||||
if (alias == NULL && required) { \
|
||||
printf("Failed to find required function \"%s\" in %s\n", \
|
||||
STRINGIFY(name##_v2), __CudaLibName); \
|
||||
return CUDA_ERROR_UNKNOWN; \
|
||||
#define GET_PROC_EX_V2(name, alias, required) \
|
||||
alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2)); \
|
||||
if (alias == NULL && required) { \
|
||||
printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
|
||||
return CUDA_ERROR_UNKNOWN; \
|
||||
}
|
||||
|
||||
#define GET_PROC_EX_V3(name, alias, required) \
|
||||
alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3)); \
|
||||
if (alias == NULL && required) { \
|
||||
printf("Failed to find required function \"%s\" in %s\n", \
|
||||
STRINGIFY(name##_v3), __CudaLibName); \
|
||||
return CUDA_ERROR_UNKNOWN; \
|
||||
#define GET_PROC_EX_V3(name, alias, required) \
|
||||
alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3)); \
|
||||
if (alias == NULL && required) { \
|
||||
printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
|
||||
return CUDA_ERROR_UNKNOWN; \
|
||||
}
|
||||
|
||||
#else
|
||||
#error unsupported platform
|
||||
#endif
|
||||
|
||||
#define CHECKED_CALL(call) \
|
||||
do { \
|
||||
CUresult result = (call); \
|
||||
if (CUDA_SUCCESS != result) { \
|
||||
return result; \
|
||||
} \
|
||||
} while(0)
|
||||
#define CHECKED_CALL(call) \
|
||||
do { \
|
||||
CUresult result = (call); \
|
||||
if (CUDA_SUCCESS != result) { \
|
||||
return result; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1)
|
||||
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0)
|
||||
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name, name, 1)
|
||||
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name, name, 0)
|
||||
#define GET_PROC(name) GET_PROC_REQUIRED(name)
|
||||
#define GET_PROC_V2(name) GET_PROC_EX_V2(name,name,1)
|
||||
#define GET_PROC_V3(name) GET_PROC_EX_V3(name,name,1)
|
||||
#define GET_PROC_V2(name) GET_PROC_EX_V2(name, name, 1)
|
||||
#define GET_PROC_V3(name) GET_PROC_EX_V3(name, name, 1)
|
||||
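On the POSIX branch the GET_PROC family reduces to a dlsym() lookup plus a NULL check, with the _v2/_v3 variants only appending a version suffix to the symbol name. A standalone sketch of the same lookup (assumes a Linux machine with the driver installed; the library may also be named libcuda.so.1):

#include <cstdio>
#include <dlfcn.h>
#include <cuda.h>   // for CUresult

// Same shape as the tcu* typedefs resolved above.
typedef CUresult (*tcuDriverGetVersion_sketch)(int *driverVersion);

int main()
{
    void *lib = dlopen("libcuda.so", RTLD_NOW);   // what LOAD_LIBRARY() does on Linux
    if (lib == NULL) {
        printf("dlopen failed\n");
        return 1;
    }

    // GET_PROC(cuDriverGetVersion) is exactly this cast-and-check;
    // GET_PROC_V2/_V3 would look up "cuDriverGetVersion_v2"/"_v3" instead.
    tcuDriverGetVersion_sketch getVer = (tcuDriverGetVersion_sketch)dlsym(lib, "cuDriverGetVersion");
    if (getVer == NULL) {
        printf("symbol not found\n");
        return 1;
    }

    int ver = 0;
    getVer(&ver);
    printf("driver version %d\n", ver);
    return 0;
}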
|
||||
CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
|
||||
{
|
||||
CUDADRIVER CudaDrvLib;
|
||||
int driverVer = 1000;
|
||||
int driverVer = 1000;
|
||||
|
||||
CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));
|
||||
|
||||
@ -359,8 +352,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
|
||||
// available since 2.2. if not present, version 1.0 is assumed
|
||||
GET_PROC_OPTIONAL(cuDriverGetVersion);
|
||||
|
||||
if (cuDriverGetVersion)
|
||||
{
|
||||
if (cuDriverGetVersion) {
|
||||
CHECKED_CALL(cuDriverGetVersion(&driverVer));
|
||||
}
|
||||
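The version obtained here uses the usual CUDA encoding of major*1000 + minor*10 (11.4 becomes 11040), which is what the driverVer >= 5000 / 4020 / 4010 gates below compare against; drivers old enough to lack cuDriverGetVersion keep the 1.0 default of 1000. A short decode inline with the surrounding function, for reference:

// Decode the driver version used by the feature gates below (e.g. 11040 -> 11.4).
int majorVer = driverVer / 1000;
int minorVer = (driverVer % 1000) / 10;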
|
||||
@ -428,24 +420,21 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
|
||||
GET_PROC(cuStreamDestroy);
|
||||
|
||||
// These are CUDA 5.0 new functions
|
||||
if (driverVer >= 5000)
|
||||
{
|
||||
if (driverVer >= 5000) {
|
||||
GET_PROC(cuMipmappedArrayCreate);
|
||||
GET_PROC(cuMipmappedArrayDestroy);
|
||||
GET_PROC(cuMipmappedArrayGetLevel);
|
||||
}
|
||||
|
||||
// These are CUDA 4.2 new functions
|
||||
if (driverVer >= 4020)
|
||||
{
|
||||
if (driverVer >= 4020) {
|
||||
GET_PROC(cuFuncSetSharedMemConfig);
|
||||
GET_PROC(cuCtxGetSharedMemConfig);
|
||||
GET_PROC(cuCtxSetSharedMemConfig);
|
||||
}
|
||||
|
||||
// These are CUDA 4.1 new functions
|
||||
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
|
||||
{
|
||||
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
|
||||
GET_PROC(cuDeviceGetByPCIBusId);
|
||||
GET_PROC(cuDeviceGetPCIBusId);
|
||||
GET_PROC(cuIpcGetEventHandle);
|
||||
@ -456,8 +445,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
|
||||
}
|
||||
|
||||
// These could be _v2 interfaces
|
||||
if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000)
|
||||
{
|
||||
if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) {
|
||||
GET_PROC_V2(cuCtxDestroy);
|
||||
GET_PROC_V2(cuCtxPopCurrent);
|
||||
GET_PROC_V2(cuCtxPushCurrent);
|
||||
@ -465,8 +453,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
|
||||
GET_PROC_V2(cuEventDestroy);
|
||||
}
|
||||
|
||||
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
|
||||
{
|
||||
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
|
||||
GET_PROC_V2(cuDeviceTotalMem);
|
||||
GET_PROC_V2(cuCtxCreate);
|
||||
GET_PROC_V2(cuModuleGetGlobal);
|
||||
@ -507,17 +494,14 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
|
||||
GET_PROC_V2(cuTexRefSetAddress);
|
||||
GET_PROC_V2(cuTexRefGetAddress);
|
||||
|
||||
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
|
||||
{
|
||||
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
|
||||
GET_PROC_V3(cuTexRefSetAddress2D);
|
||||
}
|
||||
else
|
||||
{
|
||||
else {
|
||||
GET_PROC_V2(cuTexRefSetAddress2D);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
else {
|
||||
// versions earlier than 3020
|
||||
GET_PROC(cuDeviceTotalMem);
|
||||
GET_PROC(cuCtxCreate);
|
||||
@ -562,8 +546,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
|
||||
}
|
||||
|
||||
// The following functions are specific to CUDA versions
|
||||
if (driverVer >= 4000)
|
||||
{
|
||||
if (driverVer >= 4000) {
|
||||
GET_PROC(cuCtxSetCurrent);
|
||||
GET_PROC(cuCtxGetCurrent);
|
||||
GET_PROC(cuMemHostRegister);
|
||||
@ -574,8 +557,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
|
||||
GET_PROC(cuProfilerStop);
|
||||
}
|
||||
|
||||
if (driverVer >= 3010)
|
||||
{
|
||||
if (driverVer >= 3010) {
|
||||
GET_PROC(cuModuleGetSurfRef);
|
||||
GET_PROC(cuSurfRefSetArray);
|
||||
GET_PROC(cuSurfRefGetArray);
|
||||
@ -583,8 +565,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
|
||||
GET_PROC(cuCtxGetLimit);
|
||||
}
|
||||
|
||||
if (driverVer >= 3000)
|
||||
{
|
||||
if (driverVer >= 3000) {
|
||||
GET_PROC(cuMemcpyDtoDAsync);
|
||||
GET_PROC(cuFuncSetCacheConfig);
|
||||
#ifdef CUDA_INIT_D3D11
|
||||
@ -595,12 +576,10 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
|
||||
GET_PROC(cuGraphicsUnregisterResource);
|
||||
GET_PROC(cuGraphicsSubResourceGetMappedArray);
|
||||
|
||||
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
|
||||
{
|
||||
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
|
||||
GET_PROC_V2(cuGraphicsResourceGetMappedPointer);
|
||||
}
|
||||
else
|
||||
{
|
||||
else {
|
||||
GET_PROC(cuGraphicsResourceGetMappedPointer);
|
||||
}
|
||||
|
||||
@ -610,8 +589,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
|
||||
GET_PROC(cuGetExportTable);
|
||||
}
|
||||
|
||||
if (driverVer >= 2030)
|
||||
{
|
||||
if (driverVer >= 2030) {
|
||||
GET_PROC(cuMemHostGetFlags);
|
||||
#ifdef CUDA_INIT_D3D10
|
||||
GET_PROC(cuD3D10GetDevice);
|
||||
@ -624,17 +602,16 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
|
||||
#endif
|
||||
}
|
||||
|
||||
if (driverVer >= 2010)
|
||||
{
|
||||
if (driverVer >= 2010) {
|
||||
GET_PROC(cuModuleLoadDataEx);
|
||||
GET_PROC(cuModuleLoadFatBinary);
|
||||
#ifdef CUDA_INIT_OPENGL
|
||||
GET_PROC(cuGLCtxCreate);
|
||||
GET_PROC(cuGraphicsGLRegisterBuffer);
|
||||
GET_PROC(cuGraphicsGLRegisterImage);
|
||||
# ifdef WIN32
|
||||
#ifdef WIN32
|
||||
GET_PROC(cuWGLGetDevice);
|
||||
# endif
|
||||
#endif
|
||||
#endif
|
||||
#ifdef CUDA_INIT_D3D9
|
||||
GET_PROC(cuD3D9GetDevice);
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -14,21 +14,17 @@
|
||||
#ifndef HELPER_CUDA_DRVAPI_H
|
||||
#define HELPER_CUDA_DRVAPI_H
|
||||
|
||||
#include <helper_string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <helper_string.h>
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(a, b) (a > b ? a : b)
|
||||
#endif
|
||||
|
||||
#ifndef HELPER_CUDA_DRVAPI_H
|
||||
inline int ftoi(float value) {
|
||||
return (value >= 0 ? static_cast<int>(value + 0.5)
|
||||
: static_cast<int>(value - 0.5));
|
||||
}
|
||||
inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5) : static_cast<int>(value - 0.5)); }
|
||||
#endif
|
||||
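ftoi() rounds half away from zero instead of truncating toward zero the way a plain cast does; a self-contained check of the behaviour (the helper is copied inline so the snippet builds on its own):

#include <cstdio>

static int ftoi_check(float value) { return (value >= 0 ? (int)(value + 0.5) : (int)(value - 0.5)); }

int main()
{
    // ftoi-style rounding vs. truncation: prints "3 -3 2"
    printf("%d %d %d\n", ftoi_check(2.5f), ftoi_check(-2.5f), (int)2.5f);
    return 0;
}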
|
||||
#ifndef EXIT_WAIVED
|
||||
@ -47,311 +43,302 @@ inline int ftoi(float value) {
|
||||
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
|
||||
|
||||
// These are the inline versions for all of the SDK helper functions
|
||||
inline void __checkCudaErrors(CUresult err, const char *file, const int line) {
|
||||
if (CUDA_SUCCESS != err) {
|
||||
const char *errorStr = NULL;
|
||||
cuGetErrorString(err, &errorStr);
|
||||
fprintf(stderr,
|
||||
"checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
|
||||
"line %i.\n",
|
||||
err, errorStr, file, line);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
inline void __checkCudaErrors(CUresult err, const char *file, const int line)
|
||||
{
|
||||
if (CUDA_SUCCESS != err) {
|
||||
const char *errorStr = NULL;
|
||||
cuGetErrorString(err, &errorStr);
|
||||
fprintf(stderr,
|
||||
"checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
|
||||
"line %i.\n",
|
||||
err,
|
||||
errorStr,
|
||||
file,
|
||||
line);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
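checkCudaErrors() is meant to wrap every driver call: on anything other than CUDA_SUCCESS it prints the error string with file and line, then exits. A usage sketch, assuming this header and the driver API are included and cuInit() has already run:

// Sketch: any failing driver call aborts with a readable message.
CUdevice dev = 0;
checkCudaErrors(cuDeviceGet(&dev, 0));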
|
||||
// This function wraps the CUDA Driver API into a template function
|
||||
template <class T>
|
||||
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
|
||||
int device) {
|
||||
checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
|
||||
template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
|
||||
{
|
||||
checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
|
||||
}
|
||||
#endif
|
||||
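getCudaAttribute() is just a typed front end for cuDeviceGetAttribute; a usage sketch querying the SM count of device ordinal 0, with the same includes assumed as above:

int smCount = 0;
getCudaAttribute<int>(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);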
|
||||
// Beginning of GPU Architecture definitions
|
||||
inline int _ConvertSMVer2CoresDRV(int major, int minor) {
|
||||
// Defines for GPU Architecture types (using the SM version to determine the #
|
||||
// of cores per SM)
|
||||
typedef struct {
|
||||
int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM
|
||||
// minor version
|
||||
int Cores;
|
||||
} sSMtoCores;
|
||||
inline int _ConvertSMVer2CoresDRV(int major, int minor)
|
||||
{
|
||||
// Defines for GPU Architecture types (using the SM version to determine the #
|
||||
// of cores per SM)
|
||||
typedef struct
|
||||
{
|
||||
int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM
|
||||
// minor version
|
||||
int Cores;
|
||||
} sSMtoCores;
|
||||
|
||||
sSMtoCores nGpuArchCoresPerSM[] = {
|
||||
{0x30, 192},
|
||||
{0x32, 192},
|
||||
{0x35, 192},
|
||||
{0x37, 192},
|
||||
{0x50, 128},
|
||||
{0x52, 128},
|
||||
{0x53, 128},
|
||||
{0x60, 64},
|
||||
{0x61, 128},
|
||||
{0x62, 128},
|
||||
{0x70, 64},
|
||||
{0x72, 64},
|
||||
{0x75, 64},
|
||||
{0x80, 64},
|
||||
{0x86, 128},
|
||||
{0x87, 128},
|
||||
{0x90, 128},
|
||||
{-1, -1}};
|
||||
sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192},
|
||||
{0x32, 192},
|
||||
{0x35, 192},
|
||||
{0x37, 192},
|
||||
{0x50, 128},
|
||||
{0x52, 128},
|
||||
{0x53, 128},
|
||||
{0x60, 64},
|
||||
{0x61, 128},
|
||||
{0x62, 128},
|
||||
{0x70, 64},
|
||||
{0x72, 64},
|
||||
{0x75, 64},
|
||||
{0x80, 64},
|
||||
{0x86, 128},
|
||||
{0x87, 128},
|
||||
{0x90, 128},
|
||||
{-1, -1}};
|
||||
|
||||
int index = 0;
|
||||
int index = 0;
|
||||
|
||||
while (nGpuArchCoresPerSM[index].SM != -1) {
|
||||
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
|
||||
return nGpuArchCoresPerSM[index].Cores;
|
||||
while (nGpuArchCoresPerSM[index].SM != -1) {
|
||||
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
|
||||
return nGpuArchCoresPerSM[index].Cores;
|
||||
}
|
||||
|
||||
index++;
|
||||
}
|
||||
|
||||
index++;
|
||||
}
|
||||
|
||||
// If we don't find the values, we default to using the previous one to run
|
||||
// properly
|
||||
printf(
|
||||
"MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
|
||||
major, minor, nGpuArchCoresPerSM[index - 1].Cores);
|
||||
return nGpuArchCoresPerSM[index - 1].Cores;
|
||||
// If we don't find the values, we default to using the previous one to run
|
||||
// properly
|
||||
printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
|
||||
major,
|
||||
minor,
|
||||
nGpuArchCoresPerSM[index - 1].Cores);
|
||||
return nGpuArchCoresPerSM[index - 1].Cores;
|
||||
}
|
||||
// end of GPU Architecture definitions
|
||||
// end of GPU Architecture definitions
|
||||
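The lookup key packs the compute capability into one byte, major version in the high nibble and minor version in the low nibble; a worked example against the table above:

// SM 8.6: (8 << 4) + 6 = 0x86, which the table above maps to 128 cores per SM.
int coresPerSM = _ConvertSMVer2CoresDRV(8, 6);   // returns 128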
|
||||
#ifdef __cuda_cuda_h__
|
||||
// General GPU Device CUDA Initialization
|
||||
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
|
||||
int cuDevice = 0;
|
||||
int deviceCount = 0;
|
||||
checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
|
||||
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
|
||||
{
|
||||
int cuDevice = 0;
|
||||
int deviceCount = 0;
|
||||
checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
|
||||
|
||||
checkCudaErrors(cuDeviceGetCount(&deviceCount));
|
||||
checkCudaErrors(cuDeviceGetCount(&deviceCount));
|
||||
|
||||
if (deviceCount == 0) {
|
||||
fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (deviceCount == 0) {
|
||||
fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
int dev = 0;
|
||||
dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device=");
|
||||
int dev = 0;
|
||||
dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device=");
|
||||
|
||||
if (dev < 0) {
|
||||
dev = 0;
|
||||
}
|
||||
if (dev < 0) {
|
||||
dev = 0;
|
||||
}
|
||||
|
||||
if (dev > deviceCount - 1) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
|
||||
deviceCount);
|
||||
fprintf(stderr,
|
||||
">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n",
|
||||
dev);
|
||||
fprintf(stderr, "\n");
|
||||
return -dev;
|
||||
}
|
||||
if (dev > deviceCount - 1) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
|
||||
fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
|
||||
fprintf(stderr, "\n");
|
||||
return -dev;
|
||||
}
|
||||
|
||||
checkCudaErrors(cuDeviceGet(&cuDevice, dev));
|
||||
char name[100];
|
||||
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
|
||||
checkCudaErrors(cuDeviceGet(&cuDevice, dev));
|
||||
char name[100];
|
||||
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
|
||||
|
||||
int computeMode;
|
||||
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
|
||||
int computeMode;
|
||||
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
|
||||
|
||||
if (computeMode == CU_COMPUTEMODE_PROHIBITED) {
|
||||
fprintf(stderr,
|
||||
"Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no "
|
||||
"threads can use this CUDA Device.\n");
|
||||
return -1;
|
||||
}
|
||||
if (computeMode == CU_COMPUTEMODE_PROHIBITED) {
|
||||
fprintf(stderr,
|
||||
"Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no "
|
||||
"threads can use this CUDA Device.\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) {
|
||||
printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
|
||||
}
|
||||
if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) {
|
||||
printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
|
||||
}
|
||||
|
||||
return dev;
|
||||
return dev;
|
||||
}
|
||||
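gpuDeviceInitDRV() honours an optional -device=N flag, falls back to ordinal 0, and hands back a negative value when the requested ordinal does not exist; a usage sketch with main()'s argc/argv:

int dev = gpuDeviceInitDRV(argc, (const char **)argv);   // "-device=N" or 0 by default
if (dev < 0) {
    fprintf(stderr, "invalid device requested, exiting\n");
    exit(EXIT_FAILURE);
}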
|
||||
// This function returns the best GPU based on performance
|
||||
inline int gpuGetMaxGflopsDeviceIdDRV() {
|
||||
CUdevice current_device = 0;
|
||||
CUdevice max_perf_device = 0;
|
||||
int device_count = 0;
|
||||
int sm_per_multiproc = 0;
|
||||
unsigned long long max_compute_perf = 0;
|
||||
int major = 0;
|
||||
int minor = 0;
|
||||
int multiProcessorCount;
|
||||
int clockRate;
|
||||
int devices_prohibited = 0;
|
||||
inline int gpuGetMaxGflopsDeviceIdDRV()
|
||||
{
|
||||
CUdevice current_device = 0;
|
||||
CUdevice max_perf_device = 0;
|
||||
int device_count = 0;
|
||||
int sm_per_multiproc = 0;
|
||||
unsigned long long max_compute_perf = 0;
|
||||
int major = 0;
|
||||
int minor = 0;
|
||||
int multiProcessorCount;
|
||||
int clockRate;
|
||||
int devices_prohibited = 0;
|
||||
|
||||
cuInit(0, __CUDA_API_VERSION);
|
||||
checkCudaErrors(cuDeviceGetCount(&device_count));
|
||||
cuInit(0, __CUDA_API_VERSION);
|
||||
checkCudaErrors(cuDeviceGetCount(&device_count));
|
||||
|
||||
if (device_count == 0) {
|
||||
fprintf(stderr,
|
||||
"gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Find the best CUDA capable GPU device
|
||||
current_device = 0;
|
||||
|
||||
while (current_device < device_count) {
|
||||
checkCudaErrors(cuDeviceGetAttribute(
|
||||
&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
|
||||
current_device));
|
||||
checkCudaErrors(cuDeviceGetAttribute(
|
||||
&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
|
||||
checkCudaErrors(cuDeviceGetAttribute(
|
||||
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
|
||||
checkCudaErrors(cuDeviceGetAttribute(
|
||||
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
|
||||
|
||||
int computeMode;
|
||||
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,
|
||||
current_device);
|
||||
|
||||
if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
|
||||
if (major == 9999 && minor == 9999) {
|
||||
sm_per_multiproc = 1;
|
||||
} else {
|
||||
sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
|
||||
}
|
||||
|
||||
unsigned long long compute_perf =
|
||||
(unsigned long long)(multiProcessorCount * sm_per_multiproc *
|
||||
clockRate);
|
||||
|
||||
if (compute_perf > max_compute_perf) {
|
||||
max_compute_perf = compute_perf;
|
||||
max_perf_device = current_device;
|
||||
}
|
||||
} else {
|
||||
devices_prohibited++;
|
||||
if (device_count == 0) {
|
||||
fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
++current_device;
|
||||
}
|
||||
// Find the best CUDA capable GPU device
|
||||
current_device = 0;
|
||||
|
||||
if (devices_prohibited == device_count) {
|
||||
fprintf(stderr,
|
||||
"gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode "
|
||||
"prohibited.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
while (current_device < device_count) {
|
||||
checkCudaErrors(
|
||||
cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device));
|
||||
checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
|
||||
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
|
||||
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
|
||||
|
||||
return max_perf_device;
|
||||
int computeMode;
|
||||
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
|
||||
|
||||
if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
|
||||
if (major == 9999 && minor == 9999) {
|
||||
sm_per_multiproc = 1;
|
||||
}
|
||||
else {
|
||||
sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
|
||||
}
|
||||
|
||||
unsigned long long compute_perf = (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);
|
||||
|
||||
if (compute_perf > max_compute_perf) {
|
||||
max_compute_perf = compute_perf;
|
||||
max_perf_device = current_device;
|
||||
}
|
||||
}
|
||||
else {
|
||||
devices_prohibited++;
|
||||
}
|
||||
|
||||
++current_device;
|
||||
}
|
||||
|
||||
if (devices_prohibited == device_count) {
|
||||
fprintf(stderr,
|
||||
"gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode "
|
||||
"prohibited.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
return max_perf_device;
|
||||
}
|
||||
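The ranking metric is simply multiprocessor count times cores per SM times clock rate (the clock attribute is reported in kHz), so only the relative order between devices matters. A worked example with hypothetical numbers:

// Hypothetical SM 8.6 part: 46 SMs * 128 cores/SM * 1,695,000 kHz
unsigned long long compute_perf = 46ULL * 128ULL * 1695000ULL;   // = 9,980,160,000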
|
||||
// General initialization call to pick the best CUDA Device
|
||||
inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
|
||||
CUdevice cuDevice;
|
||||
int devID = 0;
|
||||
inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
|
||||
{
|
||||
CUdevice cuDevice;
|
||||
int devID = 0;
|
||||
|
||||
// If the command-line has a device number specified, use it
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
|
||||
devID = gpuDeviceInitDRV(argc, argv);
|
||||
// If the command-line has a device number specified, use it
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
|
||||
devID = gpuDeviceInitDRV(argc, argv);
|
||||
|
||||
if (devID < 0) {
|
||||
printf("exiting...\n");
|
||||
exit(EXIT_SUCCESS);
|
||||
if (devID < 0) {
|
||||
printf("exiting...\n");
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Otherwise pick the device with highest Gflops/s
|
||||
char name[100];
|
||||
devID = gpuGetMaxGflopsDeviceIdDRV();
|
||||
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
|
||||
cuDeviceGetName(name, 100, cuDevice);
|
||||
printf("> Using CUDA Device [%d]: %s\n", devID, name);
|
||||
}
|
||||
} else {
|
||||
// Otherwise pick the device with highest Gflops/s
|
||||
char name[100];
|
||||
devID = gpuGetMaxGflopsDeviceIdDRV();
|
||||
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
|
||||
cuDeviceGetName(name, 100, cuDevice);
|
||||
printf("> Using CUDA Device [%d]: %s\n", devID, name);
|
||||
}
|
||||
|
||||
cuDeviceGet(&cuDevice, devID);
|
||||
cuDeviceGet(&cuDevice, devID);
|
||||
|
||||
return cuDevice;
|
||||
return cuDevice;
|
||||
}
|
||||
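findCudaDeviceDRV() is the one-call entry point used by the samples: it defers to gpuDeviceInitDRV() when -device=N is given and otherwise picks the highest-GFLOPS device. Usage sketch:

CUdevice cuDevice = findCudaDeviceDRV(argc, (const char **)argv);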
|
||||
inline CUdevice findIntegratedGPUDrv() {
|
||||
CUdevice current_device = 0;
|
||||
int device_count = 0;
|
||||
int devices_prohibited = 0;
|
||||
int isIntegrated;
|
||||
inline CUdevice findIntegratedGPUDrv()
|
||||
{
|
||||
CUdevice current_device = 0;
|
||||
int device_count = 0;
|
||||
int devices_prohibited = 0;
|
||||
int isIntegrated;
|
||||
|
||||
cuInit(0, __CUDA_API_VERSION);
|
||||
checkCudaErrors(cuDeviceGetCount(&device_count));
|
||||
cuInit(0, __CUDA_API_VERSION);
|
||||
checkCudaErrors(cuDeviceGetCount(&device_count));
|
||||
|
||||
if (device_count == 0) {
|
||||
fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Find the integrated GPU which is compute capable
|
||||
while (current_device < device_count) {
|
||||
int computeMode = -1;
|
||||
checkCudaErrors(cuDeviceGetAttribute(
|
||||
&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
|
||||
checkCudaErrors(cuDeviceGetAttribute(
|
||||
&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
|
||||
|
||||
// If the GPU is integrated and is not in Compute Mode prohibited, use
|
||||
// that
|
||||
if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
|
||||
int major = 0, minor = 0;
|
||||
char deviceName[256];
|
||||
checkCudaErrors(cuDeviceGetAttribute(
|
||||
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
|
||||
current_device));
|
||||
checkCudaErrors(cuDeviceGetAttribute(
|
||||
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
|
||||
current_device));
|
||||
checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
|
||||
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
|
||||
current_device, deviceName, major, minor);
|
||||
|
||||
return current_device;
|
||||
} else {
|
||||
devices_prohibited++;
|
||||
if (device_count == 0) {
|
||||
fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
current_device++;
|
||||
}
|
||||
// Find the integrated GPU which is compute capable
|
||||
while (current_device < device_count) {
|
||||
int computeMode = -1;
|
||||
checkCudaErrors(cuDeviceGetAttribute(&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
|
||||
checkCudaErrors(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
|
||||
|
||||
if (devices_prohibited == device_count) {
|
||||
fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
// If the GPU is integrated and is not in Compute Mode prohibited, use
|
||||
// that
|
||||
if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
|
||||
int major = 0, minor = 0;
|
||||
char deviceName[256];
|
||||
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
|
||||
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
|
||||
checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
|
||||
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device, deviceName, major, minor);
|
||||
|
||||
return -1;
|
||||
return current_device;
|
||||
}
|
||||
else {
|
||||
devices_prohibited++;
|
||||
}
|
||||
|
||||
current_device++;
|
||||
}
|
||||
|
||||
if (devices_prohibited == device_count) {
|
||||
fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
// General check for CUDA GPU SM Capabilities
|
||||
inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version,
|
||||
int devID) {
|
||||
CUdevice cuDevice;
|
||||
char name[256];
|
||||
int major = 0, minor = 0;
|
||||
inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
|
||||
{
|
||||
CUdevice cuDevice;
|
||||
char name[256];
|
||||
int major = 0, minor = 0;
|
||||
|
||||
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
|
||||
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
|
||||
checkCudaErrors(cuDeviceGetAttribute(
|
||||
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
|
||||
checkCudaErrors(cuDeviceGetAttribute(
|
||||
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
|
||||
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
|
||||
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
|
||||
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
|
||||
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
|
||||
|
||||
if ((major > major_version) ||
|
||||
(major == major_version && minor >= minor_version)) {
|
||||
printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name,
|
||||
major, minor);
|
||||
return true;
|
||||
} else {
|
||||
printf(
|
||||
"No GPU device was found that can support CUDA compute capability "
|
||||
"%d.%d.\n",
|
||||
major_version, minor_version);
|
||||
return false;
|
||||
}
|
||||
if ((major > major_version) || (major == major_version && minor >= minor_version)) {
|
||||
printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
|
||||
return true;
|
||||
}
|
||||
else {
|
||||
printf("No GPU device was found that can support CUDA compute capability "
|
||||
"%d.%d.\n",
|
||||
major_version,
|
||||
minor_version);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
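checkCudaCapabilitiesDRV() is the usual gate for samples that need a minimum compute capability; a sketch requiring SM 3.5 on an already-selected ordinal devID (hypothetical variable):

if (!checkCudaCapabilitiesDRV(3, 5, devID)) {
    exit(EXIT_WAIVED);   // EXIT_WAIVED is defined near the top of this header
}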
|
||||
// end of CUDA Helper Functions
|
||||
|
||||
#endif // HELPER_CUDA_DRVAPI_H
|
||||
// end of CUDA Helper Functions
|
||||
|
||||
#endif // HELPER_CUDA_DRVAPI_H
|
||||
|
@ -34,8 +34,8 @@
|
||||
#define WA (4 * block_size) // Matrix A width
|
||||
#define HA (6 * block_size) // Matrix A height
|
||||
#define WB (4 * block_size) // Matrix B width
|
||||
#define HB WA // Matrix B height
|
||||
#define WC WB // Matrix C width
|
||||
#define HC HA // Matrix C height
|
||||
#define HB WA // Matrix B height
|
||||
#define WC WB // Matrix C width
|
||||
#define HC HA // Matrix C height
|
||||
|
||||
#endif // _MATRIXMUL_H_
|
||||
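With the block_size of 32 that initCUDA() selects further down, these macros work out to concrete sizes worth keeping in mind when reading the launch code:

// WA = 128, HA = 192, WB = 128, HB = WA = 128, WC = WB = 128, HC = HA = 192
// => C is HC x WC = 192 x 128 floats, and the kernel is launched on a
//    (WC / 32) x (HC / 32) = 4 x 6 grid of 32 x 32 thread blocks.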
|
@ -43,10 +43,10 @@
|
||||
*/
|
||||
|
||||
// includes, system
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
// includes, CUDA
|
||||
#include "cuda_drvapi_dynlink.h"
|
||||
@ -60,7 +60,7 @@
|
||||
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
|
||||
|
||||
#if defined _MSC_VER
|
||||
#pragma warning (disable : 4312)
|
||||
#pragma warning(disable : 4312)
|
||||
#endif
|
||||
|
||||
|
||||
@ -68,7 +68,7 @@ extern "C" void computeGold(float *, const float *, const float *, unsigned int,
|
||||
// Globals
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
CUcontext g_cuContext;
|
||||
bool noprompt = false;
|
||||
bool noprompt = false;
|
||||
|
||||
static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
|
||||
|
||||
@ -78,8 +78,7 @@ static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
void randomInit(float *data, size_t size)
|
||||
{
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
data[i] = rand() / (float)RAND_MAX;
|
||||
}
|
||||
}
|
||||
@ -89,33 +88,29 @@ void randomInit(float *data, size_t size)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size_out)
|
||||
{
|
||||
CUresult status;
|
||||
CUdevice cuDevice;
|
||||
CUmodule cuModule;
|
||||
CUresult status;
|
||||
CUdevice cuDevice;
|
||||
CUmodule cuModule;
|
||||
CUfunction cuFunction;
|
||||
int major, minor, block_size, devID = 0;
|
||||
char deviceName[256];
|
||||
int major, minor, block_size, devID = 0;
|
||||
char deviceName[256];
|
||||
|
||||
// link to cuda driver dynamically
|
||||
checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
|
||||
|
||||
// This assumes that the user is attempting to specify an explicit device -device=n
|
||||
if (argc > 1)
|
||||
{
|
||||
if (argc > 1) {
|
||||
bool bFound = false;
|
||||
|
||||
for (int param=0; param < argc; param++)
|
||||
{
|
||||
if (!strncmp(argv[param], "-device", 7))
|
||||
{
|
||||
int i=(int)strlen(argv[1]);
|
||||
for (int param = 0; param < argc; param++) {
|
||||
if (!strncmp(argv[param], "-device", 7)) {
|
||||
int i = (int)strlen(argv[1]);
|
||||
|
||||
while (argv[1][i] != '=')
|
||||
{
|
||||
while (argv[1][i] != '=') {
|
||||
i--;
|
||||
}
|
||||
|
||||
devID = atoi(&argv[1][++i]);
|
||||
devID = atoi(&argv[1][++i]);
|
||||
bFound = true;
|
||||
}
|
||||
|
||||
@ -128,16 +123,15 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
|
||||
int deviceCount = 0;
|
||||
checkCudaErrors(cuDeviceGetCount(&deviceCount));
|
||||
|
||||
if (deviceCount == 0)
|
||||
{
|
||||
if (deviceCount == 0) {
|
||||
fprintf(stderr, "No devices supporting CUDA detected, exiting...\n");
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
if (devID < 0) devID = 0;
|
||||
if (devID < 0)
|
||||
devID = 0;
|
||||
|
||||
if (devID > deviceCount -1)
|
||||
{
|
||||
if (devID > deviceCount - 1) {
|
||||
fprintf(stderr, "initCUDA (Device=%d) invalid GPU device. %d GPU device(s) detected.\n\n", devID, deviceCount);
|
||||
status = CUDA_ERROR_NOT_FOUND;
|
||||
|
||||
@ -153,14 +147,13 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
|
||||
checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice));
|
||||
printf("> Device %d: \"%s\" with Compute %d.%d capability\n", cuDevice, deviceName, major, minor);
|
||||
|
||||
block_size = 32;
|
||||
block_size = 32;
|
||||
*block_size_out = block_size;
|
||||
|
||||
// create context for picked device
|
||||
status = cuCtxCreate(&g_cuContext, 0, cuDevice);
|
||||
|
||||
if (CUDA_SUCCESS != status)
|
||||
{
|
||||
if (CUDA_SUCCESS != status) {
|
||||
cuCtxDestroy(g_cuContext);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
@ -169,53 +162,53 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
|
||||
{
|
||||
// in this branch we use compilation with parameters
|
||||
const unsigned int jitNumOptions = 3;
|
||||
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
|
||||
void **jitOptVals = new void *[jitNumOptions];
|
||||
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
|
||||
void **jitOptVals = new void *[jitNumOptions];
|
||||
|
||||
// set up size of compilation log buffer
|
||||
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
|
||||
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
|
||||
int jitLogBufferSize = 1024;
|
||||
jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
|
||||
jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
|
||||
|
||||
// set up pointer to the compilation log buffer
|
||||
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
|
||||
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
|
||||
char *jitLogBuffer = new char[jitLogBufferSize];
|
||||
jitOptVals[1] = jitLogBuffer;
|
||||
jitOptVals[1] = jitLogBuffer;
|
||||
|
||||
// set up pointer to set the Maximum # of registers for a particular kernel
|
||||
jitOptions[2] = CU_JIT_MAX_REGISTERS;
|
||||
jitOptions[2] = CU_JIT_MAX_REGISTERS;
|
||||
int jitRegCount = 32;
|
||||
jitOptVals[2] = (void *)(size_t)jitRegCount;
|
||||
jitOptVals[2] = (void *)(size_t)jitRegCount;
|
||||
|
||||
// compile with set parameters
|
||||
printf("> Compiling CUDA module\n");
|
||||
|
||||
#if defined(_WIN64) || defined(__LP64__)
|
||||
status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
|
||||
status =
|
||||
cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
|
||||
#else
|
||||
status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
|
||||
status =
|
||||
cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
|
||||
#endif
|
||||
|
||||
printf("> PTX JIT log:\n%s\n", jitLogBuffer);
|
||||
|
||||
delete [] jitOptions;
|
||||
delete [] jitOptVals;
|
||||
delete [] jitLogBuffer;
|
||||
delete[] jitOptions;
|
||||
delete[] jitOptVals;
|
||||
delete[] jitLogBuffer;
|
||||
}
|
||||
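The three JIT options ask the PTX compiler for a 1 KB info log and cap kernels at 32 registers before cuModuleLoadDataEx() compiles the embedded PTX. A condensed sketch of the same pattern, assuming ptxImage points at a NUL-terminated PTX string:

#include <cstdio>
#include <cuda.h>

static CUresult loadPtxWithLog(CUmodule *module, const void *ptxImage)
{
    char logBuf[1024] = {0};
    CUjit_option opts[3] = {CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER, CU_JIT_MAX_REGISTERS};
    void *vals[3] = {(void *)(size_t)sizeof(logBuf),   // log buffer size in bytes
                     (void *)logBuf,                    // where the JIT writes its log
                     (void *)(size_t)32};               // register cap per kernel

    CUresult status = cuModuleLoadDataEx(module, ptxImage, 3, opts, vals);
    printf("PTX JIT log:\n%s\n", logBuf);
    return status;
}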
|
||||
if (CUDA_SUCCESS != status)
|
||||
{
|
||||
if (CUDA_SUCCESS != status) {
|
||||
printf("Error while compiling PTX\n");
|
||||
cuCtxDestroy(g_cuContext);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// retrieve CUDA function from the compiled module
|
||||
status = cuModuleGetFunction(&cuFunction, cuModule,
|
||||
(block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
|
||||
status = cuModuleGetFunction(
|
||||
&cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
|
||||
|
||||
if (CUDA_SUCCESS != status)
|
||||
{
|
||||
if (CUDA_SUCCESS != status) {
|
||||
cuCtxDestroy(g_cuContext);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
@ -233,21 +226,21 @@ int main(int argc, char **argv)
|
||||
printf("[ %s ]\n", sSDKsample);
|
||||
|
||||
// initialize CUDA
|
||||
CUfunction matrixMul = NULL;
|
||||
int block_size = 0;
|
||||
CUfunction matrixMul = NULL;
|
||||
int block_size = 0;
|
||||
checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size));
|
||||
|
||||
// set seed for rand()
|
||||
srand(2006);
|
||||
|
||||
// allocate host memory for matrices A and B
|
||||
size_t size_A = WA * HA;
|
||||
size_t mem_size_A = sizeof(float) * size_A;
|
||||
size_t size_B = WB * HB;
|
||||
size_t mem_size_B = sizeof(float) * size_B;
|
||||
size_t size_A = WA * HA;
|
||||
size_t mem_size_A = sizeof(float) * size_A;
|
||||
size_t size_B = WB * HB;
|
||||
size_t mem_size_B = sizeof(float) * size_B;
|
||||
|
||||
float *h_A = (float *) malloc(mem_size_A);
|
||||
float *h_B = (float *) malloc(mem_size_B);
|
||||
float *h_A = (float *)malloc(mem_size_A);
|
||||
float *h_B = (float *)malloc(mem_size_B);
|
||||
|
||||
// initialize host memory
|
||||
randomInit(h_A, size_A);
|
||||
@ -264,26 +257,24 @@ int main(int argc, char **argv)
|
||||
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
|
||||
|
||||
// allocate device memory for result
|
||||
size_t size_C = WC * HC;
|
||||
size_t mem_size_C = sizeof(float) * size_C;
|
||||
size_t size_C = WC * HC;
|
||||
size_t mem_size_C = sizeof(float) * size_C;
|
||||
|
||||
CUdeviceptr d_C;
|
||||
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
|
||||
|
||||
// allocate mem for the result on host side
|
||||
float *h_C = (float *) malloc(mem_size_C);
|
||||
float *h_C = (float *)malloc(mem_size_C);
|
||||
|
||||
#if __CUDA_API_VERSION >= 4000
|
||||
{
|
||||
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
|
||||
int Matrix_Width_A = WA;
|
||||
int Matrix_Width_B = WB;
|
||||
void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
|
||||
int Matrix_Width_A = WA;
|
||||
int Matrix_Width_B = WB;
|
||||
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
|
||||
|
||||
checkCudaErrors(cuLaunchKernel(matrixMul, (WC/block_size), (HC/block_size), 1,
|
||||
block_size , block_size , 1,
|
||||
0,
|
||||
NULL, args, NULL));
|
||||
checkCudaErrors(cuLaunchKernel(
|
||||
matrixMul, (WC / block_size), (HC / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
|
||||
}
|
||||
#else // __CUDA_API_VERSION <= 3020
|
||||
{
|
||||
@ -312,7 +303,7 @@ int main(int argc, char **argv)
|
||||
|
||||
checkCudaErrors(cuParamSetSize(matrixMul, offset));
|
||||
checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1));
|
||||
checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2*block_size*block_size*sizeof(float)));
|
||||
checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2 * block_size * block_size * sizeof(float)));
|
||||
|
||||
// set execution configuration for the CUDA kernel
|
||||
checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size));
|
||||
@ -322,19 +313,18 @@ int main(int argc, char **argv)
checkCudaErrors(cuCtxSynchronize());

// copy result from device to host
checkCudaErrors(cuMemcpyDtoH((void *) h_C, d_C, mem_size_C));
checkCudaErrors(cuMemcpyDtoH((void *)h_C, d_C, mem_size_C));

// compute reference solution
float *reference = (float *) malloc(mem_size_C);
float *reference = (float *)malloc(mem_size_C);
computeGold(reference, h_A, h_B, HA, WA, WB);

// check result
float diff=0.0f;
float diff = 0.0f;

for (unsigned int i=0; i<size_C; i++)
{
for (unsigned int i = 0; i < size_C; i++) {
float tmp = reference[i] - h_C[i];
diff += tmp*tmp;
diff += tmp * tmp;
}

int res = (diff / (float)size_C < 1e-6f);
@ -349,7 +339,7 @@ int main(int argc, char **argv)
checkCudaErrors(cuMemFree(d_C));
checkCudaErrors(cuCtxDestroy(g_cuContext));

printf("Test run %s\n", (1==res) ? "success!" : "failed!");
printf("Test run %s\n", (1 == res) ? "success!" : "failed!");

exit((1 == res) ? EXIT_SUCCESS : EXIT_FAILURE);
}
@ -28,8 +28,7 @@

////////////////////////////////////////////////////////////////////////////////
// export C interface
extern "C"
void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);

////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
@ -40,16 +39,13 @@ void computeGold(float *, const float *, const float *, unsigned int, unsigned i
//! @param hA height of matrix A
//! @param wB width of matrix B
////////////////////////////////////////////////////////////////////////////////
void
computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
void computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{
for (unsigned int i = 0; i < hA; ++i)
for (unsigned int j = 0; j < wB; ++j)
{
for (unsigned int j = 0; j < wB; ++j) {
double sum = 0;

for (unsigned int k = 0; k < wA; ++k)
{
for (unsigned int k = 0; k < wA; ++k) {
double a = A[i * wA + k];
double b = B[k * wB + j];
sum += a * b;
File diff suppressed because it is too large
@ -32,7 +32,8 @@
#define __matrixMul_kernel_32_ptxdump_h__

#if defined __cplusplus
extern "C" {
extern "C"
{
#endif

extern unsigned char matrixMul_kernel_32_ptxdump[25784];
|
||||
|
File diff suppressed because it is too large
@ -32,7 +32,8 @@
#define __matrixMul_kernel_64_ptxdump_h__

#if defined __cplusplus
extern "C" {
extern "C"
{
#endif

extern unsigned char matrixMul_kernel_64_ptxdump[26489];
|
||||
|
@ -42,207 +42,208 @@
*/

// System includes
#include <stdio.h>
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

#include "nvrtc_helper.h"

// Helper functions and utilities to work with CUDA
#include <helper_functions.h>

void constantInit(float *data, int size, float val) {
|
||||
for (int i = 0; i < size; ++i) {
|
||||
data[i] = val;
|
||||
}
|
||||
void constantInit(float *data, int size, float val)
|
||||
{
|
||||
for (int i = 0; i < size; ++i) {
|
||||
data[i] = val;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Run a simple test of matrix multiplication using CUDA
|
||||
*/
|
||||
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
|
||||
dim3 &dimsB) {
|
||||
// Allocate host memory for matrices A and B
|
||||
unsigned int size_A = dimsA.x * dimsA.y;
|
||||
unsigned int mem_size_A = sizeof(float) * size_A;
|
||||
float *h_A = (float *)malloc(mem_size_A);
|
||||
unsigned int size_B = dimsB.x * dimsB.y;
|
||||
unsigned int mem_size_B = sizeof(float) * size_B;
|
||||
float *h_B = (float *)malloc(mem_size_B);
|
||||
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
|
||||
{
|
||||
// Allocate host memory for matrices A and B
|
||||
unsigned int size_A = dimsA.x * dimsA.y;
|
||||
unsigned int mem_size_A = sizeof(float) * size_A;
|
||||
float *h_A = (float *)malloc(mem_size_A);
|
||||
unsigned int size_B = dimsB.x * dimsB.y;
|
||||
unsigned int mem_size_B = sizeof(float) * size_B;
|
||||
float *h_B = (float *)malloc(mem_size_B);
|
||||
|
||||
// Initialize host memory
|
||||
const float valB = 0.01f;
|
||||
constantInit(h_A, size_A, 1.0f);
|
||||
constantInit(h_B, size_B, valB);
|
||||
// Initialize host memory
|
||||
const float valB = 0.01f;
|
||||
constantInit(h_A, size_A, 1.0f);
|
||||
constantInit(h_B, size_B, valB);
|
||||
|
||||
// Allocate device memory
|
||||
CUdeviceptr d_A, d_B, d_C;
|
||||
// Allocate device memory
|
||||
CUdeviceptr d_A, d_B, d_C;
|
||||
|
||||
char *cubin, *kernel_file;
|
||||
size_t cubinSize;
|
||||
char *cubin, *kernel_file;
|
||||
size_t cubinSize;
|
||||
|
||||
kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
|
||||
compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1);
|
||||
kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
|
||||
compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1);
|
||||
|
||||
CUmodule module = loadCUBIN(cubin, argc, argv);
|
||||
CUmodule module = loadCUBIN(cubin, argc, argv);
|
||||
|
||||
// Allocate host matrix C
|
||||
dim3 dimsC(dimsB.x, dimsA.y, 1);
|
||||
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
|
||||
float *h_C = (float *)malloc(mem_size_C);
|
||||
// Allocate host matrix C
|
||||
dim3 dimsC(dimsB.x, dimsA.y, 1);
|
||||
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
|
||||
float *h_C = (float *)malloc(mem_size_C);
|
||||
|
||||
if (h_C == NULL) {
|
||||
fprintf(stderr, "Failed to allocate host matrix C!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
|
||||
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
|
||||
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
|
||||
|
||||
// copy host memory to device
|
||||
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
|
||||
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
|
||||
|
||||
// Setup execution parameters
|
||||
dim3 threads(block_size, block_size);
|
||||
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
|
||||
|
||||
// Create and start timer
|
||||
printf("Computing result using CUDA Kernel...\n");
|
||||
|
||||
CUfunction kernel_addr;
|
||||
if (block_size == 16) {
|
||||
checkCudaErrors(
|
||||
cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
|
||||
} else {
|
||||
checkCudaErrors(
|
||||
cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
|
||||
}
|
||||
|
||||
void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x,
|
||||
(void *)&dimsB.x};
|
||||
|
||||
// Execute the kernel
|
||||
int nIter = 300;
|
||||
|
||||
for (int j = 0; j < nIter; j++) {
|
||||
checkCudaErrors(
|
||||
cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */
|
||||
threads.x, threads.y, threads.z, /* block dim */
|
||||
0, 0, /* shared mem, stream */
|
||||
&arr[0], /* arguments */
|
||||
0));
|
||||
|
||||
checkCudaErrors(cuCtxSynchronize());
|
||||
}
|
||||
|
||||
// Copy result from device to host
|
||||
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));
|
||||
|
||||
printf("Checking computed result for correctness: ");
|
||||
|
||||
bool correct = true;
|
||||
|
||||
// test relative error by the formula
|
||||
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
|
||||
|
||||
double eps = 1.e-6; // machine zero
|
||||
|
||||
for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
|
||||
double abs_err = fabs(h_C[i] - (dimsA.x * valB));
|
||||
double dot_length = dimsA.x;
|
||||
double abs_val = fabs(h_C[i]);
|
||||
double rel_err = abs_err / abs_val / dot_length;
|
||||
|
||||
if (rel_err > eps) {
|
||||
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
|
||||
h_C[i], dimsA.x * valB, eps);
|
||||
correct = false;
|
||||
if (h_C == NULL) {
|
||||
fprintf(stderr, "Failed to allocate host matrix C!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
|
||||
checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
|
||||
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
|
||||
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
|
||||
|
||||
printf(
|
||||
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
|
||||
"Results may vary when GPU Boost is enabled.\n");
|
||||
// copy host memory to device
|
||||
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
|
||||
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
|
||||
|
||||
// Clean up memory
|
||||
free(h_A);
|
||||
free(h_B);
|
||||
free(h_C);
|
||||
// Setup execution parameters
|
||||
dim3 threads(block_size, block_size);
|
||||
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
|
||||
|
||||
checkCudaErrors(cuMemFree(d_A));
|
||||
checkCudaErrors(cuMemFree(d_B));
|
||||
checkCudaErrors(cuMemFree(d_C));
|
||||
// Create and start timer
|
||||
printf("Computing result using CUDA Kernel...\n");
|
||||
|
||||
if (correct) {
|
||||
return EXIT_SUCCESS;
|
||||
} else {
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
CUfunction kernel_addr;
|
||||
if (block_size == 16) {
|
||||
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
|
||||
}
|
||||
else {
|
||||
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
|
||||
}
|
||||
|
||||
void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x};
|
||||
|
||||
// Execute the kernel
|
||||
int nIter = 300;
|
||||
|
||||
for (int j = 0; j < nIter; j++) {
|
||||
checkCudaErrors(cuLaunchKernel(kernel_addr,
|
||||
grid.x,
|
||||
grid.y,
|
||||
grid.z, /* grid dim */
|
||||
threads.x,
|
||||
threads.y,
|
||||
threads.z, /* block dim */
|
||||
0,
|
||||
0, /* shared mem, stream */
|
||||
&arr[0], /* arguments */
|
||||
0));
|
||||
|
||||
checkCudaErrors(cuCtxSynchronize());
|
||||
}
|
||||
|
||||
// Copy result from device to host
|
||||
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));
|
||||
|
||||
printf("Checking computed result for correctness: ");
|
||||
|
||||
bool correct = true;
|
||||
|
||||
// test relative error by the formula
|
||||
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
|
||||
|
||||
double eps = 1.e-6; // machine zero
|
||||
|
||||
for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
|
||||
double abs_err = fabs(h_C[i] - (dimsA.x * valB));
|
||||
double dot_length = dimsA.x;
|
||||
double abs_val = fabs(h_C[i]);
|
||||
double rel_err = abs_err / abs_val / dot_length;
|
||||
|
||||
if (rel_err > eps) {
|
||||
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
|
||||
correct = false;
|
||||
}
|
||||
}
|
||||
|
||||
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
|
||||
|
||||
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
|
||||
"Results may vary when GPU Boost is enabled.\n");
|
||||
|
||||
// Clean up memory
|
||||
free(h_A);
|
||||
free(h_B);
|
||||
free(h_C);
|
||||
|
||||
checkCudaErrors(cuMemFree(d_A));
|
||||
checkCudaErrors(cuMemFree(d_B));
|
||||
checkCudaErrors(cuMemFree(d_C));
|
||||
|
||||
if (correct) {
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
else {
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Program main
|
||||
*/
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
printf("[Matrix Multiply Using CUDA] - Starting...\n");
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
printf("[Matrix Multiply Using CUDA] - Starting...\n");
|
||||
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
|
||||
checkCmdLineFlag(argc, (const char **)argv, "?")) {
|
||||
printf("Usage -device=n (n >= 0 for deviceID)\n");
|
||||
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
|
||||
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
|
||||
printf(
|
||||
" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
|
||||
printf("Usage -device=n (n >= 0 for deviceID)\n");
|
||||
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
|
||||
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
|
||||
printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
|
||||
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
int block_size = 32;
|
||||
int block_size = 32;
|
||||
|
||||
// original:
|
||||
dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
|
||||
dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
|
||||
// original:
|
||||
dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
|
||||
dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
|
||||
|
||||
// reduce sizes to avoid running out of memory
|
||||
// dim3 dimsA(32,32, 1);
|
||||
// dim3 dimsB(32,32,1);
|
||||
// reduce sizes to avoid running out of memory
|
||||
// dim3 dimsA(32,32, 1);
|
||||
// dim3 dimsB(32,32,1);
|
||||
|
||||
// width of Matrix A
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
|
||||
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
|
||||
}
|
||||
// width of Matrix A
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
|
||||
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
|
||||
}
|
||||
|
||||
// height of Matrix A
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
|
||||
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
|
||||
}
|
||||
// height of Matrix A
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
|
||||
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
|
||||
}
|
||||
|
||||
// width of Matrix B
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
|
||||
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
|
||||
}
|
||||
// width of Matrix B
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
|
||||
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
|
||||
}
|
||||
|
||||
// height of Matrix B
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
|
||||
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
|
||||
}
|
||||
// height of Matrix B
|
||||
if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
|
||||
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
|
||||
}
|
||||
|
||||
if (dimsA.x != dimsB.y) {
|
||||
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
|
||||
dimsA.x, dimsB.y);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (dimsA.x != dimsB.y) {
|
||||
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x,
|
||||
dimsB.y);
|
||||
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
|
||||
|
||||
int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
|
||||
int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
|
||||
|
||||
exit(matrix_result);
|
||||
exit(matrix_result);
|
||||
}
|
||||
|
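The correctness check in matrixMultiply() above exploits the constant initialization: with A filled with 1.0f and B with valB, every element of C should equal dimsA.x * valB, so only a relative error against that constant has to be tested. A stand-alone sketch of that check (an illustration, not code from the sample):

#include <math.h>

// Returns true when every element of C is within a relative error of eps of
// the expected constant dotLength * valB (dotLength is the shared dimension).
static bool checkConstantResult(const float *C, int numElements, int dotLength, float valB, double eps)
{
    for (int i = 0; i < numElements; i++) {
        double abs_err = fabs(C[i] - dotLength * valB);
        double rel_err = abs_err / fabs(C[i]) / dotLength;
        if (rel_err > eps) {
            return false;
        }
    }
    return true;
}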
@ -48,84 +48,83 @@
|
||||
|
||||
#include <cooperative_groups.h>
|
||||
|
||||
template <int BLOCK_SIZE>
|
||||
__device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
|
||||
// Handle to thread block group
|
||||
cooperative_groups::thread_block cta =
|
||||
cooperative_groups::this_thread_block();
|
||||
// Block index
|
||||
int bx = blockIdx.x;
|
||||
int by = blockIdx.y;
|
||||
template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
|
||||
{
|
||||
// Handle to thread block group
|
||||
cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
|
||||
// Block index
|
||||
int bx = blockIdx.x;
|
||||
int by = blockIdx.y;
|
||||
|
||||
// Thread index
|
||||
int tx = threadIdx.x;
|
||||
int ty = threadIdx.y;
|
||||
// Thread index
|
||||
int tx = threadIdx.x;
|
||||
int ty = threadIdx.y;
|
||||
|
||||
// Index of the first sub-matrix of A processed by the block
|
||||
int aBegin = wA * BLOCK_SIZE * by;
|
||||
// Index of the first sub-matrix of A processed by the block
|
||||
int aBegin = wA * BLOCK_SIZE * by;
|
||||
|
||||
// Index of the last sub-matrix of A processed by the block
|
||||
int aEnd = aBegin + wA - 1;
|
||||
// Index of the last sub-matrix of A processed by the block
|
||||
int aEnd = aBegin + wA - 1;
|
||||
|
||||
// Step size used to iterate through the sub-matrices of A
|
||||
int aStep = BLOCK_SIZE;
|
||||
// Step size used to iterate through the sub-matrices of A
|
||||
int aStep = BLOCK_SIZE;
|
||||
|
||||
// Index of the first sub-matrix of B processed by the block
|
||||
int bBegin = BLOCK_SIZE * bx;
|
||||
// Index of the first sub-matrix of B processed by the block
|
||||
int bBegin = BLOCK_SIZE * bx;
|
||||
|
||||
// Step size used to iterate through the sub-matrices of B
|
||||
int bStep = BLOCK_SIZE * wB;
|
||||
// Step size used to iterate through the sub-matrices of B
|
||||
int bStep = BLOCK_SIZE * wB;
|
||||
|
||||
// Csub is used to store the element of the block sub-matrix
|
||||
// that is computed by the thread
|
||||
float Csub = 0;
|
||||
// Csub is used to store the element of the block sub-matrix
|
||||
// that is computed by the thread
|
||||
float Csub = 0;
|
||||
|
||||
// Loop over all the sub-matrices of A and B
|
||||
// required to compute the block sub-matrix
|
||||
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
|
||||
// Declaration of the shared memory array As used to
|
||||
// store the sub-matrix of A
|
||||
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
|
||||
// Loop over all the sub-matrices of A and B
|
||||
// required to compute the block sub-matrix
|
||||
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
|
||||
// Declaration of the shared memory array As used to
|
||||
// store the sub-matrix of A
|
||||
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
|
||||
|
||||
// Declaration of the shared memory array Bs used to
|
||||
// store the sub-matrix of B
|
||||
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
|
||||
// Declaration of the shared memory array Bs used to
|
||||
// store the sub-matrix of B
|
||||
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
|
||||
|
||||
// Load the matrices from device memory
|
||||
// to shared memory; each thread loads
|
||||
// one element of each matrix
|
||||
As[ty][tx] = A[a + wA * ty + tx];
|
||||
Bs[ty][tx] = B[b + wB * ty + tx];
|
||||
// Load the matrices from device memory
|
||||
// to shared memory; each thread loads
|
||||
// one element of each matrix
|
||||
As[ty][tx] = A[a + wA * ty + tx];
|
||||
Bs[ty][tx] = B[b + wB * ty + tx];
|
||||
|
||||
// Synchronize to make sure the matrices are loaded
|
||||
cooperative_groups::sync(cta);
|
||||
// Synchronize to make sure the matrices are loaded
|
||||
cooperative_groups::sync(cta);
|
||||
|
||||
// Multiply the two matrices together;
|
||||
// each thread computes one element
|
||||
// of the block sub-matrix
|
||||
#pragma unroll
|
||||
for (int k = 0; k < BLOCK_SIZE; ++k) {
|
||||
Csub += As[ty][k] * Bs[k][tx];
|
||||
for (int k = 0; k < BLOCK_SIZE; ++k) {
|
||||
Csub += As[ty][k] * Bs[k][tx];
|
||||
}
|
||||
|
||||
// Synchronize to make sure that the preceding
|
||||
// computation is done before loading two new
|
||||
// sub-matrices of A and B in the next iteration
|
||||
cooperative_groups::sync(cta);
|
||||
}
|
||||
|
||||
// Synchronize to make sure that the preceding
|
||||
// computation is done before loading two new
|
||||
// sub-matrices of A and B in the next iteration
|
||||
cooperative_groups::sync(cta);
|
||||
}
|
||||
|
||||
// Write the block sub-matrix to device memory;
|
||||
// each thread writes one element
|
||||
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
|
||||
C[c + wB * ty + tx] = Csub;
|
||||
// Write the block sub-matrix to device memory;
|
||||
// each thread writes one element
|
||||
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
|
||||
C[c + wB * ty + tx] = Csub;
|
||||
}
|
||||
|
||||
extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B,
|
||||
int wA, int wB) {
|
||||
matrixMulCUDA<16>(C, A, B, wA, wB);
|
||||
extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
|
||||
{
|
||||
matrixMulCUDA<16>(C, A, B, wA, wB);
|
||||
}
|
||||
|
||||
extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B,
|
||||
int wA, int wB) {
|
||||
matrixMulCUDA<32>(C, A, B, wA, wB);
|
||||
extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
|
||||
{
|
||||
matrixMulCUDA<32>(C, A, B, wA, wB);
|
||||
}
|
||||
|
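The templated matrixMulCUDA&lt;BLOCK_SIZE&gt; kernel above keeps one BLOCK_SIZE x BLOCK_SIZE tile of A and one of B in shared memory per block, and the extern "C" wrappers exist so the driver API can look the two instantiations up by their unmangled names. A quick sizing sketch (illustration only, not part of the sample):

#include <cstddef>

// Static shared memory used by one block of the tiled kernel: two tiles of
// BLOCK_SIZE x BLOCK_SIZE floats (As and Bs).
constexpr size_t tileSharedBytes(int blockSize)
{
    return 2u * blockSize * blockSize * sizeof(float);
}
// tileSharedBytes(16) == 2 KiB and tileSharedBytes(32) == 8 KiB, comfortably
// below the 48 KiB of shared memory a block can use by default.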
@ -28,252 +28,254 @@
|
||||
#include <cooperative_groups.h>
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
#include <helper_cuda.h>
|
||||
#include <assert.h>
|
||||
#include <helper_cuda.h>
|
||||
|
||||
#include "mergeSort_common.h"
|
||||
|
||||
inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB,
|
||||
uint &valB, uint arrowDir) {
|
||||
uint t;
|
||||
inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB, uint arrowDir)
|
||||
{
|
||||
uint t;
|
||||
|
||||
if ((keyA > keyB) == arrowDir) {
|
||||
t = keyA;
|
||||
keyA = keyB;
|
||||
keyB = t;
|
||||
t = valA;
|
||||
valA = valB;
|
||||
valB = t;
|
||||
}
|
||||
if ((keyA > keyB) == arrowDir) {
|
||||
t = keyA;
|
||||
keyA = keyB;
|
||||
keyB = t;
|
||||
t = valA;
|
||||
valA = valB;
|
||||
valB = t;
|
||||
}
|
||||
}
|
||||
|
||||
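Comparator() above is the compare-exchange primitive of a bitonic network: bitonicSortSharedKernel below builds bitonic sequences of growing size in shared memory, two elements per thread. A CPU reference sketch of the same network for a power-of-two length, sorting ascending (an illustration, not code from the sample):

#include <utility>
#include <vector>

static void bitonicSortCPU(std::vector<unsigned int> &v)
{
    const size_t n = v.size(); // must be a power of two
    for (size_t size = 2; size <= n; size <<= 1) {                 // length of bitonic sequences being built
        for (size_t stride = size / 2; stride > 0; stride >>= 1) { // compare-exchange distance
            for (size_t i = 0; i < n; i++) {
                size_t j = i ^ stride;                             // partner element
                if (j > i) {
                    bool ascending = ((i & size) == 0);            // direction of this subsequence
                    if (ascending ? (v[i] > v[j]) : (v[i] < v[j]))
                        std::swap(v[i], v[j]);
                }
            }
        }
    }
}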
__global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
|
||||
uint *d_SrcKey, uint *d_SrcVal,
|
||||
uint arrayLength, uint sortDir) {
|
||||
// Handle to thread block group
|
||||
cg::thread_block cta = cg::this_thread_block();
|
||||
// Shared memory storage for one or more short vectors
|
||||
__shared__ uint s_key[SHARED_SIZE_LIMIT];
|
||||
__shared__ uint s_val[SHARED_SIZE_LIMIT];
|
||||
__global__ void
|
||||
bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint sortDir)
|
||||
{
|
||||
// Handle to thread block group
|
||||
cg::thread_block cta = cg::this_thread_block();
|
||||
// Shared memory storage for one or more short vectors
|
||||
__shared__ uint s_key[SHARED_SIZE_LIMIT];
|
||||
__shared__ uint s_val[SHARED_SIZE_LIMIT];
|
||||
|
||||
// Offset to the beginning of subbatch and load data
|
||||
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
s_key[threadIdx.x + 0] = d_SrcKey[0];
|
||||
s_val[threadIdx.x + 0] = d_SrcVal[0];
|
||||
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
|
||||
d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
|
||||
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
|
||||
d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
|
||||
// Offset to the beginning of subbatch and load data
|
||||
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
s_key[threadIdx.x + 0] = d_SrcKey[0];
|
||||
s_val[threadIdx.x + 0] = d_SrcVal[0];
|
||||
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
|
||||
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
|
||||
|
||||
for (uint size = 2; size < arrayLength; size <<= 1) {
|
||||
// Bitonic merge
|
||||
uint dir = (threadIdx.x & (size / 2)) != 0;
|
||||
for (uint size = 2; size < arrayLength; size <<= 1) {
|
||||
// Bitonic merge
|
||||
uint dir = (threadIdx.x & (size / 2)) != 0;
|
||||
|
||||
for (uint stride = size / 2; stride > 0; stride >>= 1) {
|
||||
cg::sync(cta);
|
||||
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
|
||||
Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
|
||||
s_val[pos + stride], dir);
|
||||
for (uint stride = size / 2; stride > 0; stride >>= 1) {
|
||||
cg::sync(cta);
|
||||
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
|
||||
Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ddd == sortDir for the last bitonic merge step
|
||||
{
|
||||
for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
|
||||
cg::sync(cta);
|
||||
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
|
||||
Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
|
||||
s_val[pos + stride], sortDir);
|
||||
// ddd == sortDir for the last bitonic merge step
|
||||
{
|
||||
for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
|
||||
cg::sync(cta);
|
||||
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
|
||||
Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], sortDir);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cg::sync(cta);
|
||||
d_DstKey[0] = s_key[threadIdx.x + 0];
|
||||
d_DstVal[0] = s_val[threadIdx.x + 0];
|
||||
d_DstKey[(SHARED_SIZE_LIMIT / 2)] =
|
||||
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
|
||||
d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
|
||||
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
|
||||
cg::sync(cta);
|
||||
d_DstKey[0] = s_key[threadIdx.x + 0];
|
||||
d_DstVal[0] = s_val[threadIdx.x + 0];
|
||||
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
|
||||
d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
|
||||
}
|
||||
|
||||
// Helper function (also used by odd-even merge sort)
|
||||
extern "C" uint factorRadix2(uint *log2L, uint L) {
|
||||
if (!L) {
|
||||
*log2L = 0;
|
||||
return 0;
|
||||
} else {
|
||||
for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
|
||||
;
|
||||
extern "C" uint factorRadix2(uint *log2L, uint L)
|
||||
{
|
||||
if (!L) {
|
||||
*log2L = 0;
|
||||
return 0;
|
||||
}
|
||||
else {
|
||||
for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
|
||||
;
|
||||
|
||||
return L;
|
||||
}
|
||||
return L;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
|
||||
uint *d_SrcKey, uint *d_SrcVal,
|
||||
uint batchSize, uint arrayLength,
|
||||
uint sortDir) {
|
||||
// Nothing to sort
|
||||
if (arrayLength < 2) {
|
||||
return;
|
||||
}
|
||||
extern "C" void bitonicSortShared(uint *d_DstKey,
|
||||
uint *d_DstVal,
|
||||
uint *d_SrcKey,
|
||||
uint *d_SrcVal,
|
||||
uint batchSize,
|
||||
uint arrayLength,
|
||||
uint sortDir)
|
||||
{
|
||||
// Nothing to sort
|
||||
if (arrayLength < 2) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Only power-of-two array lengths are supported by this implementation
|
||||
uint log2L;
|
||||
uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
|
||||
assert(factorizationRemainder == 1);
|
||||
// Only power-of-two array lengths are supported by this implementation
|
||||
uint log2L;
|
||||
uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
|
||||
assert(factorizationRemainder == 1);
|
||||
|
||||
uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
|
||||
uint threadCount = SHARED_SIZE_LIMIT / 2;
|
||||
uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
|
||||
uint threadCount = SHARED_SIZE_LIMIT / 2;
|
||||
|
||||
assert(arrayLength <= SHARED_SIZE_LIMIT);
|
||||
assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);
|
||||
assert(arrayLength <= SHARED_SIZE_LIMIT);
|
||||
assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);
|
||||
|
||||
bitonicSortSharedKernel<<<blockCount, threadCount>>>(
|
||||
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
|
||||
getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
|
||||
bitonicSortSharedKernel<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
|
||||
getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
|
||||
}
|
||||
|
||||
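In bitonicSortShared() above each block sorts one tile of SHARED_SIZE_LIMIT key/value pairs and every thread owns two elements, hence blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT and threadCount = SHARED_SIZE_LIMIT / 2. Assuming SHARED_SIZE_LIMIT is 1024 (its definition lives in mergeSort_common.h and is not shown in this diff), a batch of 64 arrays of 1024 elements launches 64 * 1024 / 1024 = 64 blocks of 1024 / 2 = 512 threads each.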
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 3: merge elementary intervals
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
static inline __host__ __device__ uint iDivUp(uint a, uint b) {
|
||||
return ((a % b) == 0) ? (a / b) : (a / b + 1);
|
||||
}
|
||||
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }
|
||||
|
||||
static inline __host__ __device__ uint getSampleCount(uint dividend) {
|
||||
return iDivUp(dividend, SAMPLE_STRIDE);
|
||||
static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
|
||||
|
||||
template <uint sortDir>
|
||||
static inline __device__ void
|
||||
ComparatorExtended(uint &keyA, uint &valA, uint &flagA, uint &keyB, uint &valB, uint &flagB, uint arrowDir)
|
||||
{
|
||||
uint t;
|
||||
|
||||
if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || ((arrowDir == sortDir) && (flagA == 1))
|
||||
|| ((arrowDir != sortDir) && (flagB == 1))) {
|
||||
t = keyA;
|
||||
keyA = keyB;
|
||||
keyB = t;
|
||||
t = valA;
|
||||
valA = valB;
|
||||
valB = t;
|
||||
t = flagA;
|
||||
flagA = flagB;
|
||||
flagB = t;
|
||||
}
|
||||
}
|
||||
|
||||
template <uint sortDir>
|
||||
static inline __device__ void ComparatorExtended(uint &keyA, uint &valA,
|
||||
uint &flagA, uint &keyB,
|
||||
uint &valB, uint &flagB,
|
||||
uint arrowDir) {
|
||||
uint t;
|
||||
__global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
|
||||
uint *d_DstVal,
|
||||
uint *d_SrcKey,
|
||||
uint *d_SrcVal,
|
||||
uint *d_LimitsA,
|
||||
uint *d_LimitsB,
|
||||
uint stride,
|
||||
uint N)
|
||||
{
|
||||
// Handle to thread block group
|
||||
cg::thread_block cta = cg::this_thread_block();
|
||||
__shared__ uint s_key[2 * SAMPLE_STRIDE];
|
||||
__shared__ uint s_val[2 * SAMPLE_STRIDE];
|
||||
__shared__ uint s_inf[2 * SAMPLE_STRIDE];
|
||||
|
||||
if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) ||
|
||||
((arrowDir == sortDir) && (flagA == 1)) ||
|
||||
((arrowDir != sortDir) && (flagB == 1))) {
|
||||
t = keyA;
|
||||
keyA = keyB;
|
||||
keyB = t;
|
||||
t = valA;
|
||||
valA = valB;
|
||||
valB = t;
|
||||
t = flagA;
|
||||
flagA = flagB;
|
||||
flagB = t;
|
||||
}
|
||||
}
|
||||
const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
|
||||
const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
|
||||
d_SrcKey += segmentBase;
|
||||
d_SrcVal += segmentBase;
|
||||
d_DstKey += segmentBase;
|
||||
d_DstVal += segmentBase;
|
||||
|
||||
template <uint sortDir>
|
||||
__global__ void bitonicMergeElementaryIntervalsKernel(
|
||||
uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal,
|
||||
uint *d_LimitsA, uint *d_LimitsB, uint stride, uint N) {
|
||||
// Handle to thread block group
|
||||
cg::thread_block cta = cg::this_thread_block();
|
||||
__shared__ uint s_key[2 * SAMPLE_STRIDE];
|
||||
__shared__ uint s_val[2 * SAMPLE_STRIDE];
|
||||
__shared__ uint s_inf[2 * SAMPLE_STRIDE];
|
||||
// Set up threadblock-wide parameters
|
||||
__shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst;
|
||||
|
||||
const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
|
||||
const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
|
||||
d_SrcKey += segmentBase;
|
||||
d_SrcVal += segmentBase;
|
||||
d_DstKey += segmentBase;
|
||||
d_DstVal += segmentBase;
|
||||
if (threadIdx.x == 0) {
|
||||
uint segmentElementsA = stride;
|
||||
uint segmentElementsB = umin(stride, N - segmentBase - stride);
|
||||
uint segmentSamplesA = stride / SAMPLE_STRIDE;
|
||||
uint segmentSamplesB = getSampleCount(segmentElementsB);
|
||||
uint segmentSamples = segmentSamplesA + segmentSamplesB;
|
||||
|
||||
// Set up threadblock-wide parameters
|
||||
__shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst;
|
||||
startSrcA = d_LimitsA[blockIdx.x];
|
||||
startSrcB = d_LimitsB[blockIdx.x];
|
||||
startDst = startSrcA + startSrcB;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
uint segmentElementsA = stride;
|
||||
uint segmentElementsB = umin(stride, N - segmentBase - stride);
|
||||
uint segmentSamplesA = stride / SAMPLE_STRIDE;
|
||||
uint segmentSamplesB = getSampleCount(segmentElementsB);
|
||||
uint segmentSamples = segmentSamplesA + segmentSamplesB;
|
||||
uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
|
||||
uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
|
||||
lenSrcA = endSrcA - startSrcA;
|
||||
lenSrcB = endSrcB - startSrcB;
|
||||
}
|
||||
|
||||
startSrcA = d_LimitsA[blockIdx.x];
|
||||
startSrcB = d_LimitsB[blockIdx.x];
|
||||
startDst = startSrcA + startSrcB;
|
||||
s_inf[threadIdx.x + 0] = 1;
|
||||
s_inf[threadIdx.x + SAMPLE_STRIDE] = 1;
|
||||
|
||||
uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1]
|
||||
: segmentElementsA;
|
||||
uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
|
||||
: segmentElementsB;
|
||||
lenSrcA = endSrcA - startSrcA;
|
||||
lenSrcB = endSrcB - startSrcB;
|
||||
}
|
||||
|
||||
s_inf[threadIdx.x + 0] = 1;
|
||||
s_inf[threadIdx.x + SAMPLE_STRIDE] = 1;
|
||||
|
||||
// Load input data
|
||||
cg::sync(cta);
|
||||
|
||||
if (threadIdx.x < lenSrcA) {
|
||||
s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x];
|
||||
s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x];
|
||||
s_inf[threadIdx.x] = 0;
|
||||
}
|
||||
|
||||
// Prepare for bitonic merge by inversing the ordering
|
||||
if (threadIdx.x < lenSrcB) {
|
||||
s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
|
||||
d_SrcKey[stride + startSrcB + threadIdx.x];
|
||||
s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
|
||||
d_SrcVal[stride + startSrcB + threadIdx.x];
|
||||
s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
|
||||
}
|
||||
|
||||
//"Extended" bitonic merge
|
||||
for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
|
||||
// Load input data
|
||||
cg::sync(cta);
|
||||
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
|
||||
ComparatorExtended<sortDir>(s_key[pos + 0], s_val[pos + 0], s_inf[pos + 0],
|
||||
s_key[pos + stride], s_val[pos + stride],
|
||||
s_inf[pos + stride], sortDir);
|
||||
}
|
||||
|
||||
// Store sorted data
|
||||
cg::sync(cta);
|
||||
d_DstKey += startDst;
|
||||
d_DstVal += startDst;
|
||||
if (threadIdx.x < lenSrcA) {
|
||||
s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x];
|
||||
s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x];
|
||||
s_inf[threadIdx.x] = 0;
|
||||
}
|
||||
|
||||
if (threadIdx.x < lenSrcA) {
|
||||
d_DstKey[threadIdx.x] = s_key[threadIdx.x];
|
||||
d_DstVal[threadIdx.x] = s_val[threadIdx.x];
|
||||
}
|
||||
// Prepare for bitonic merge by inversing the ordering
|
||||
if (threadIdx.x < lenSrcB) {
|
||||
s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcKey[stride + startSrcB + threadIdx.x];
|
||||
s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcVal[stride + startSrcB + threadIdx.x];
|
||||
s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
|
||||
}
|
||||
|
||||
if (threadIdx.x < lenSrcB) {
|
||||
d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
|
||||
d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
|
||||
}
|
||||
//"Extended" bitonic merge
|
||||
for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
|
||||
cg::sync(cta);
|
||||
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
|
||||
ComparatorExtended<sortDir>(s_key[pos + 0],
|
||||
s_val[pos + 0],
|
||||
s_inf[pos + 0],
|
||||
s_key[pos + stride],
|
||||
s_val[pos + stride],
|
||||
s_inf[pos + stride],
|
||||
sortDir);
|
||||
}
|
||||
|
||||
// Store sorted data
|
||||
cg::sync(cta);
|
||||
d_DstKey += startDst;
|
||||
d_DstVal += startDst;
|
||||
|
||||
if (threadIdx.x < lenSrcA) {
|
||||
d_DstKey[threadIdx.x] = s_key[threadIdx.x];
|
||||
d_DstVal[threadIdx.x] = s_val[threadIdx.x];
|
||||
}
|
||||
|
||||
if (threadIdx.x < lenSrcB) {
|
||||
d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
|
||||
d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
|
||||
uint *d_SrcKey, uint *d_SrcVal,
|
||||
extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
|
||||
uint *d_DstVal,
|
||||
uint *d_SrcKey,
|
||||
uint *d_SrcVal,
|
||||
uint *d_LimitsA,
|
||||
uint *d_LimitsB, uint stride,
|
||||
uint N, uint sortDir) {
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
uint *d_LimitsB,
|
||||
uint stride,
|
||||
uint N,
|
||||
uint sortDir)
|
||||
{
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
|
||||
uint mergePairs = (lastSegmentElements > stride)
|
||||
? getSampleCount(N)
|
||||
: (N - lastSegmentElements) / SAMPLE_STRIDE;
|
||||
uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
|
||||
|
||||
if (sortDir) {
|
||||
bitonicMergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
|
||||
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
|
||||
N);
|
||||
getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
|
||||
} else {
|
||||
bitonicMergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
|
||||
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
|
||||
N);
|
||||
getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
|
||||
}
|
||||
if (sortDir) {
|
||||
bitonicMergeElementaryIntervalsKernel<1U>
|
||||
<<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
|
||||
getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
|
||||
}
|
||||
else {
|
||||
bitonicMergeElementaryIntervalsKernel<0U>
|
||||
<<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
|
||||
getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
|
||||
}
|
||||
}
|
||||
|
@ -26,96 +26,94 @@
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_cuda.h>
|
||||
#include <helper_functions.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <helper_functions.h>
|
||||
#include <helper_cuda.h>
|
||||
|
||||
#include "mergeSort_common.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Test driver
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
int main(int argc, char **argv) {
|
||||
uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
|
||||
uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
|
||||
StopWatchInterface *hTimer = NULL;
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
|
||||
uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
|
||||
StopWatchInterface *hTimer = NULL;
|
||||
|
||||
const uint N = 4 * 1048576;
|
||||
const uint DIR = 1;
|
||||
const uint numValues = 65536;
|
||||
const uint N = 4 * 1048576;
|
||||
const uint DIR = 1;
|
||||
const uint numValues = 65536;
|
||||
|
||||
printf("%s Starting...\n\n", argv[0]);
|
||||
printf("%s Starting...\n\n", argv[0]);
|
||||
|
||||
int dev = findCudaDevice(argc, (const char **)argv);
|
||||
int dev = findCudaDevice(argc, (const char **)argv);
|
||||
|
||||
if (dev == -1) {
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
if (dev == -1) {
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
printf("Allocating and initializing host arrays...\n\n");
|
||||
sdkCreateTimer(&hTimer);
|
||||
h_SrcKey = (uint *)malloc(N * sizeof(uint));
|
||||
h_SrcVal = (uint *)malloc(N * sizeof(uint));
|
||||
h_DstKey = (uint *)malloc(N * sizeof(uint));
|
||||
h_DstVal = (uint *)malloc(N * sizeof(uint));
|
||||
printf("Allocating and initializing host arrays...\n\n");
|
||||
sdkCreateTimer(&hTimer);
|
||||
h_SrcKey = (uint *)malloc(N * sizeof(uint));
|
||||
h_SrcVal = (uint *)malloc(N * sizeof(uint));
|
||||
h_DstKey = (uint *)malloc(N * sizeof(uint));
|
||||
h_DstVal = (uint *)malloc(N * sizeof(uint));
|
||||
|
||||
srand(2009);
|
||||
srand(2009);
|
||||
|
||||
for (uint i = 0; i < N; i++) {
|
||||
h_SrcKey[i] = rand() % numValues;
|
||||
}
|
||||
for (uint i = 0; i < N; i++) {
|
||||
h_SrcKey[i] = rand() % numValues;
|
||||
}
|
||||
|
||||
fillValues(h_SrcVal, N);
|
||||
fillValues(h_SrcVal, N);
|
||||
|
||||
printf("Allocating and initializing CUDA arrays...\n\n");
|
||||
checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
|
||||
checkCudaErrors(
|
||||
cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
|
||||
checkCudaErrors(
|
||||
cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));
|
||||
printf("Allocating and initializing CUDA arrays...\n\n");
|
||||
checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
|
||||
checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
|
||||
checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));
|
||||
|
||||
printf("Initializing GPU merge sort...\n");
|
||||
initMergeSort();
|
||||
printf("Initializing GPU merge sort...\n");
|
||||
initMergeSort();
|
||||
|
||||
printf("Running GPU merge sort...\n");
|
||||
checkCudaErrors(cudaDeviceSynchronize());
|
||||
sdkResetTimer(&hTimer);
|
||||
sdkStartTimer(&hTimer);
|
||||
mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
|
||||
checkCudaErrors(cudaDeviceSynchronize());
|
||||
sdkStopTimer(&hTimer);
|
||||
printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));
|
||||
printf("Running GPU merge sort...\n");
|
||||
checkCudaErrors(cudaDeviceSynchronize());
|
||||
sdkResetTimer(&hTimer);
|
||||
sdkStartTimer(&hTimer);
|
||||
mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
|
||||
checkCudaErrors(cudaDeviceSynchronize());
|
||||
sdkStopTimer(&hTimer);
|
||||
printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));
|
||||
|
||||
printf("Reading back GPU merge sort results...\n");
|
||||
checkCudaErrors(
|
||||
cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
|
||||
checkCudaErrors(
|
||||
cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));
|
||||
printf("Reading back GPU merge sort results...\n");
|
||||
checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
|
||||
checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));
|
||||
|
||||
printf("Inspecting the results...\n");
|
||||
uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);
|
||||
printf("Inspecting the results...\n");
|
||||
uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);
|
||||
|
||||
uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N);
|
||||
uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N);
|
||||
|
||||
printf("Shutting down...\n");
|
||||
closeMergeSort();
|
||||
sdkDeleteTimer(&hTimer);
|
||||
checkCudaErrors(cudaFree(d_SrcVal));
|
||||
checkCudaErrors(cudaFree(d_SrcKey));
|
||||
checkCudaErrors(cudaFree(d_BufVal));
|
||||
checkCudaErrors(cudaFree(d_BufKey));
|
||||
checkCudaErrors(cudaFree(d_DstVal));
|
||||
checkCudaErrors(cudaFree(d_DstKey));
|
||||
free(h_DstVal);
|
||||
free(h_DstKey);
|
||||
free(h_SrcVal);
|
||||
free(h_SrcKey);
|
||||
printf("Shutting down...\n");
|
||||
closeMergeSort();
|
||||
sdkDeleteTimer(&hTimer);
|
||||
checkCudaErrors(cudaFree(d_SrcVal));
|
||||
checkCudaErrors(cudaFree(d_SrcKey));
|
||||
checkCudaErrors(cudaFree(d_BufVal));
|
||||
checkCudaErrors(cudaFree(d_BufKey));
|
||||
checkCudaErrors(cudaFree(d_DstVal));
|
||||
checkCudaErrors(cudaFree(d_DstKey));
|
||||
free(h_DstVal);
|
||||
free(h_DstKey);
|
||||
free(h_SrcVal);
|
||||
free(h_SrcKey);
|
||||
|
||||
exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
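main() above hands the sorted output to validateSortedKeys() and validateSortedValues(); conceptually the keys must come back in order and as a permutation of the input. A much-simplified host check sketch (illustration only, not the sample's validation code):

#include <algorithm>
#include <vector>

static bool isSortedAscending(const unsigned int *key, unsigned int N)
{
    for (unsigned int i = 1; i < N; i++)
        if (key[i - 1] > key[i])
            return false;
    return true;
}

static bool isPermutationOfInput(const unsigned int *dstKey, const unsigned int *srcKey, unsigned int N)
{
    std::vector<unsigned int> a(dstKey, dstKey + N), b(srcKey, srcKey + N);
    std::sort(a.begin(), a.end());
    std::sort(b.begin(), b.end());
    return a == b;
}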
@ -39,491 +39,499 @@
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
#include <helper_cuda.h>
|
||||
|
||||
#include "mergeSort_common.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Helper functions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
static inline __host__ __device__ uint iDivUp(uint a, uint b) {
|
||||
return ((a % b) == 0) ? (a / b) : (a / b + 1);
|
||||
}
|
||||
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }
|
||||
|
||||
static inline __host__ __device__ uint getSampleCount(uint dividend) {
|
||||
return iDivUp(dividend, SAMPLE_STRIDE);
|
||||
}
|
||||
static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
|
||||
|
||||
#define W (sizeof(uint) * 8)
|
||||
static inline __device__ uint nextPowerOfTwo(uint x) {
|
||||
/*
|
||||
--x;
|
||||
x |= x >> 1;
|
||||
x |= x >> 2;
|
||||
x |= x >> 4;
|
||||
x |= x >> 8;
|
||||
x |= x >> 16;
|
||||
return ++x;
|
||||
*/
|
||||
return 1U << (W - __clz(x - 1));
|
||||
static inline __device__ uint nextPowerOfTwo(uint x)
|
||||
{
|
||||
/*
|
||||
--x;
|
||||
x |= x >> 1;
|
||||
x |= x >> 2;
|
||||
x |= x >> 4;
|
||||
x |= x >> 8;
|
||||
x |= x >> 16;
|
||||
return ++x;
|
||||
*/
|
||||
return 1U << (W - __clz(x - 1));
|
||||
}
|
||||
|
||||
template <uint sortDir>
|
||||
static inline __device__ uint binarySearchInclusive(uint val, uint *data,
|
||||
uint L, uint stride) {
|
||||
if (L == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint pos = 0;
|
||||
|
||||
for (; stride > 0; stride >>= 1) {
|
||||
uint newPos = umin(pos + stride, L);
|
||||
|
||||
if ((sortDir && (data[newPos - 1] <= val)) ||
|
||||
(!sortDir && (data[newPos - 1] >= val))) {
|
||||
pos = newPos;
|
||||
template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint val, uint *data, uint L, uint stride)
|
||||
{
|
||||
if (L == 0) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return pos;
|
||||
uint pos = 0;
|
||||
|
||||
for (; stride > 0; stride >>= 1) {
|
||||
uint newPos = umin(pos + stride, L);
|
||||
|
||||
if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
|
||||
pos = newPos;
|
||||
}
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
template <uint sortDir>
|
||||
static inline __device__ uint binarySearchExclusive(uint val, uint *data,
|
||||
uint L, uint stride) {
|
||||
if (L == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint pos = 0;
|
||||
|
||||
for (; stride > 0; stride >>= 1) {
|
||||
uint newPos = umin(pos + stride, L);
|
||||
|
||||
if ((sortDir && (data[newPos - 1] < val)) ||
|
||||
(!sortDir && (data[newPos - 1] > val))) {
|
||||
pos = newPos;
|
||||
template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint val, uint *data, uint L, uint stride)
|
||||
{
|
||||
if (L == 0) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return pos;
|
||||
uint pos = 0;
|
||||
|
||||
for (; stride > 0; stride >>= 1) {
|
||||
uint newPos = umin(pos + stride, L);
|
||||
|
||||
if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
|
||||
pos = newPos;
|
||||
}
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
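binarySearchInclusive()/binarySearchExclusive() above walk a sorted run with a power-of-two stride; for ascending data the exclusive variant returns how many elements are strictly smaller than val and the inclusive variant how many are smaller or equal, i.e. the positions std::lower_bound and std::upper_bound would report. A host-side sketch of those semantics (this reading of the device code is an interpretation, not an API of the sample):

#include <algorithm>

// Host equivalents of the two search results for ascending data.
static unsigned int lowerBoundPos(const unsigned int *data, unsigned int L, unsigned int val)
{
    return static_cast<unsigned int>(std::lower_bound(data, data + L, val) - data);
}

static unsigned int upperBoundPos(const unsigned int *data, unsigned int L, unsigned int val)
{
    return static_cast<unsigned int>(std::upper_bound(data, data + L, val) - data);
}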
////////////////////////////////////////////////////////////////////////////////
|
||||
// Bottom-level merge sort (binary search-based)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template <uint sortDir>
|
||||
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
|
||||
uint *d_SrcKey, uint *d_SrcVal,
|
||||
uint arrayLength) {
|
||||
// Handle to thread block group
|
||||
cg::thread_block cta = cg::this_thread_block();
|
||||
__shared__ uint s_key[SHARED_SIZE_LIMIT];
|
||||
__shared__ uint s_val[SHARED_SIZE_LIMIT];
|
||||
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength)
|
||||
{
|
||||
// Handle to thread block group
|
||||
cg::thread_block cta = cg::this_thread_block();
|
||||
__shared__ uint s_key[SHARED_SIZE_LIMIT];
|
||||
__shared__ uint s_val[SHARED_SIZE_LIMIT];
|
||||
|
||||
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
s_key[threadIdx.x + 0] = d_SrcKey[0];
|
||||
s_val[threadIdx.x + 0] = d_SrcVal[0];
|
||||
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
|
||||
d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
|
||||
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
|
||||
d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
|
||||
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
|
||||
s_key[threadIdx.x + 0] = d_SrcKey[0];
|
||||
s_val[threadIdx.x + 0] = d_SrcVal[0];
|
||||
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
|
||||
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
|
||||
|
||||
for (uint stride = 1; stride < arrayLength; stride <<= 1) {
|
||||
uint lPos = threadIdx.x & (stride - 1);
|
||||
uint *baseKey = s_key + 2 * (threadIdx.x - lPos);
|
||||
uint *baseVal = s_val + 2 * (threadIdx.x - lPos);
|
||||
for (uint stride = 1; stride < arrayLength; stride <<= 1) {
|
||||
uint lPos = threadIdx.x & (stride - 1);
|
||||
uint *baseKey = s_key + 2 * (threadIdx.x - lPos);
|
||||
uint *baseVal = s_val + 2 * (threadIdx.x - lPos);
|
||||
|
||||
cg::sync(cta);
|
||||
uint keyA = baseKey[lPos + 0];
|
||||
uint valA = baseVal[lPos + 0];
|
||||
uint keyB = baseKey[lPos + stride];
|
||||
uint valB = baseVal[lPos + stride];
|
||||
uint posA = binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + lPos;
|
||||
uint posB = binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + lPos;
|
||||
|
||||
cg::sync(cta);
|
||||
baseKey[posA] = keyA;
|
||||
baseVal[posA] = valA;
|
||||
baseKey[posB] = keyB;
|
||||
baseVal[posB] = valB;
|
||||
}
|
||||
|
||||
cg::sync(cta);
|
||||
uint keyA = baseKey[lPos + 0];
|
||||
uint valA = baseVal[lPos + 0];
|
||||
uint keyB = baseKey[lPos + stride];
|
||||
uint valB = baseVal[lPos + stride];
|
||||
uint posA =
|
||||
binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) +
|
||||
lPos;
|
||||
uint posB =
|
||||
binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) +
|
||||
lPos;
|
||||
|
||||
cg::sync(cta);
|
||||
baseKey[posA] = keyA;
|
||||
baseVal[posA] = valA;
|
||||
baseKey[posB] = keyB;
|
||||
baseVal[posB] = valB;
|
||||
}
|
||||
|
||||
cg::sync(cta);
|
||||
d_DstKey[0] = s_key[threadIdx.x + 0];
|
||||
d_DstVal[0] = s_val[threadIdx.x + 0];
|
||||
d_DstKey[(SHARED_SIZE_LIMIT / 2)] =
|
||||
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
|
||||
d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
|
||||
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
|
||||
d_DstKey[0] = s_key[threadIdx.x + 0];
|
||||
d_DstVal[0] = s_val[threadIdx.x + 0];
|
||||
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
|
||||
d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
|
||||
}
|
||||
|
||||
static void mergeSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey,
|
||||
uint *d_SrcVal, uint batchSize, uint arrayLength,
|
||||
uint sortDir) {
|
||||
if (arrayLength < 2) {
|
||||
return;
|
||||
}
|
||||
static void mergeSortShared(uint *d_DstKey,
|
||||
uint *d_DstVal,
|
||||
uint *d_SrcKey,
|
||||
uint *d_SrcVal,
|
||||
uint batchSize,
|
||||
uint arrayLength,
|
||||
uint sortDir)
|
||||
{
|
||||
if (arrayLength < 2) {
|
||||
return;
|
||||
}
|
||||
|
||||
assert(SHARED_SIZE_LIMIT % arrayLength == 0);
|
||||
assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0);
|
||||
uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
|
||||
uint threadCount = SHARED_SIZE_LIMIT / 2;
|
||||
assert(SHARED_SIZE_LIMIT % arrayLength == 0);
|
||||
assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0);
|
||||
uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
|
||||
uint threadCount = SHARED_SIZE_LIMIT / 2;
|
||||
|
||||
if (sortDir) {
|
||||
mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(
|
||||
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
|
||||
getLastCudaError("mergeSortShared<1><<<>>> failed\n");
|
||||
} else {
|
||||
mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(
|
||||
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
|
||||
getLastCudaError("mergeSortShared<0><<<>>> failed\n");
|
||||
}
|
||||
if (sortDir) {
|
||||
mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
|
||||
getLastCudaError("mergeSortShared<1><<<>>> failed\n");
|
||||
}
|
||||
else {
|
||||
mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
|
||||
getLastCudaError("mergeSortShared<0><<<>>> failed\n");
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 1: generate sample ranks
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template <uint sortDir>
|
||||
__global__ void generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB,
|
||||
uint *d_SrcKey, uint stride, uint N,
|
||||
uint threadCount) {
|
||||
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
__global__ void
|
||||
generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint threadCount)
|
||||
{
|
||||
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (pos >= threadCount) {
|
||||
return;
|
||||
}
|
||||
if (pos >= threadCount) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
|
||||
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
|
||||
d_SrcKey += segmentBase;
|
||||
d_RanksA += segmentBase / SAMPLE_STRIDE;
|
||||
d_RanksB += segmentBase / SAMPLE_STRIDE;
|
||||
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
|
||||
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
|
||||
d_SrcKey += segmentBase;
|
||||
d_RanksA += segmentBase / SAMPLE_STRIDE;
|
||||
d_RanksB += segmentBase / SAMPLE_STRIDE;
|
||||
|
||||
const uint segmentElementsA = stride;
|
||||
const uint segmentElementsB = umin(stride, N - segmentBase - stride);
|
||||
const uint segmentSamplesA = getSampleCount(segmentElementsA);
|
||||
const uint segmentSamplesB = getSampleCount(segmentElementsB);
|
||||
const uint segmentElementsA = stride;
|
||||
const uint segmentElementsB = umin(stride, N - segmentBase - stride);
|
||||
const uint segmentSamplesA = getSampleCount(segmentElementsA);
|
||||
const uint segmentSamplesB = getSampleCount(segmentElementsB);
|
||||
|
||||
if (i < segmentSamplesA) {
|
||||
d_RanksA[i] = i * SAMPLE_STRIDE;
|
||||
d_RanksB[i] = binarySearchExclusive<sortDir>(
|
||||
d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB,
|
||||
nextPowerOfTwo(segmentElementsB));
|
||||
}
|
||||
if (i < segmentSamplesA) {
|
||||
d_RanksA[i] = i * SAMPLE_STRIDE;
|
||||
d_RanksB[i] = binarySearchExclusive<sortDir>(
|
||||
d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, nextPowerOfTwo(segmentElementsB));
|
||||
}
|
||||
|
||||
if (i < segmentSamplesB) {
|
||||
d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
|
||||
d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
|
||||
d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA,
|
||||
nextPowerOfTwo(segmentElementsA));
|
||||
}
|
||||
if (i < segmentSamplesB) {
|
||||
d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
|
||||
d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
|
||||
d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, nextPowerOfTwo(segmentElementsA));
|
||||
}
|
||||
}
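
// Editor's sketch (hypothetical values): with SAMPLE_STRIDE == 128 and stride == 1024,
// every merge segment of 2 * stride keys owns stride / SAMPLE_STRIDE == 8 sample slots
// per half. A thread with pos == 11 therefore handles sample i == (11 & 7) == 3 of the
// segment starting at segmentBase == (11 - 3) * 2 * 128 == 2048.
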
static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey,
|
||||
uint stride, uint N, uint sortDir) {
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
uint threadCount =
|
||||
(lastSegmentElements > stride)
|
||||
? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
|
||||
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||
static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint sortDir)
|
||||
{
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
|
||||
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||
|
||||
if (sortDir) {
|
||||
generateSampleRanksKernel<1U><<<iDivUp(threadCount, 256), 256>>>(
|
||||
d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
|
||||
getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
|
||||
} else {
|
||||
generateSampleRanksKernel<0U><<<iDivUp(threadCount, 256), 256>>>(
|
||||
d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
|
||||
getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
|
||||
}
|
||||
if (sortDir) {
|
||||
generateSampleRanksKernel<1U>
|
||||
<<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
|
||||
getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
|
||||
}
|
||||
else {
|
||||
generateSampleRanksKernel<0U>
|
||||
<<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
|
||||
getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
|
||||
}
|
||||
}
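
// Editor's sketch (hypothetical values): threadCount is one thread per SAMPLE_STRIDE-wide
// sample across all full merge segments. For N == 4096, stride == 1024, SAMPLE_STRIDE == 128:
// lastSegmentElements == 4096 % 2048 == 0, so threadCount == (4096 - 0) / (2 * 128) == 16.
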
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 2: merge the sample ranks and derive elementary interval indices
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks,
|
||||
uint stride, uint N,
|
||||
uint threadCount) {
|
||||
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, uint stride, uint N, uint threadCount)
|
||||
{
|
||||
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
if (pos >= threadCount) {
|
||||
return;
|
||||
}
|
||||
if (pos >= threadCount) {
|
||||
return;
|
||||
}
|
||||
|
||||
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
|
||||
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
|
||||
d_Ranks += (pos - i) * 2;
|
||||
d_Limits += (pos - i) * 2;
|
||||
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
|
||||
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
|
||||
d_Ranks += (pos - i) * 2;
|
||||
d_Limits += (pos - i) * 2;
|
||||
|
||||
const uint segmentElementsA = stride;
|
||||
const uint segmentElementsB = umin(stride, N - segmentBase - stride);
|
||||
const uint segmentSamplesA = getSampleCount(segmentElementsA);
|
||||
const uint segmentSamplesB = getSampleCount(segmentElementsB);
|
||||
const uint segmentElementsA = stride;
|
||||
const uint segmentElementsB = umin(stride, N - segmentBase - stride);
|
||||
const uint segmentSamplesA = getSampleCount(segmentElementsA);
|
||||
const uint segmentSamplesB = getSampleCount(segmentElementsB);
|
||||
|
||||
if (i < segmentSamplesA) {
|
||||
uint dstPos = binarySearchExclusive<1U>(
|
||||
d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB,
|
||||
nextPowerOfTwo(segmentSamplesB)) +
|
||||
i;
|
||||
d_Limits[dstPos] = d_Ranks[i];
|
||||
}
|
||||
if (i < segmentSamplesA) {
|
||||
uint dstPos = binarySearchExclusive<1U>(
|
||||
d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB))
|
||||
+ i;
|
||||
d_Limits[dstPos] = d_Ranks[i];
|
||||
}
|
||||
|
||||
if (i < segmentSamplesB) {
|
||||
uint dstPos = binarySearchInclusive<1U>(d_Ranks[segmentSamplesA + i],
|
||||
d_Ranks, segmentSamplesA,
|
||||
nextPowerOfTwo(segmentSamplesA)) +
|
||||
i;
|
||||
d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
|
||||
}
|
||||
if (i < segmentSamplesB) {
|
||||
uint dstPos = binarySearchInclusive<1U>(
|
||||
d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA))
|
||||
+ i;
|
||||
d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
|
||||
}
|
||||
}
|
||||
|
||||
static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB,
|
||||
uint *d_RanksA, uint *d_RanksB, uint stride,
|
||||
uint N) {
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
uint threadCount =
|
||||
(lastSegmentElements > stride)
|
||||
? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
|
||||
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||
static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, uint *d_RanksA, uint *d_RanksB, uint stride, uint N)
|
||||
{
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
|
||||
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||
|
||||
mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(
|
||||
d_LimitsA, d_RanksA, stride, N, threadCount);
|
||||
getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");
|
||||
mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsA, d_RanksA, stride, N, threadCount);
|
||||
getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");
|
||||
|
||||
mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(
|
||||
d_LimitsB, d_RanksB, stride, N, threadCount);
|
||||
getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
|
||||
mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsB, d_RanksB, stride, N, threadCount);
|
||||
getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 3: merge elementary intervals
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
template <uint sortDir>
|
||||
inline __device__ void merge(uint *dstKey, uint *dstVal, uint *srcAKey,
|
||||
uint *srcAVal, uint *srcBKey, uint *srcBVal,
|
||||
uint lenA, uint nPowTwoLenA, uint lenB,
|
||||
uint nPowTwoLenB, cg::thread_block cta) {
|
||||
uint keyA, valA, keyB, valB, dstPosA, dstPosB;
|
||||
inline __device__ void merge(uint *dstKey,
|
||||
uint *dstVal,
|
||||
uint *srcAKey,
|
||||
uint *srcAVal,
|
||||
uint *srcBKey,
|
||||
uint *srcBVal,
|
||||
uint lenA,
|
||||
uint nPowTwoLenA,
|
||||
uint lenB,
|
||||
uint nPowTwoLenB,
|
||||
cg::thread_block cta)
|
||||
{
|
||||
uint keyA, valA, keyB, valB, dstPosA, dstPosB;
|
||||
|
||||
if (threadIdx.x < lenA) {
|
||||
keyA = srcAKey[threadIdx.x];
|
||||
valA = srcAVal[threadIdx.x];
|
||||
dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) +
|
||||
threadIdx.x;
|
||||
}
|
||||
if (threadIdx.x < lenA) {
|
||||
keyA = srcAKey[threadIdx.x];
|
||||
valA = srcAVal[threadIdx.x];
|
||||
dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + threadIdx.x;
|
||||
}
|
||||
|
||||
if (threadIdx.x < lenB) {
|
||||
keyB = srcBKey[threadIdx.x];
|
||||
valB = srcBVal[threadIdx.x];
|
||||
dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) +
|
||||
threadIdx.x;
|
||||
}
|
||||
if (threadIdx.x < lenB) {
|
||||
keyB = srcBKey[threadIdx.x];
|
||||
valB = srcBVal[threadIdx.x];
|
||||
dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + threadIdx.x;
|
||||
}
|
||||
|
||||
cg::sync(cta);
|
||||
cg::sync(cta);
|
||||
|
||||
if (threadIdx.x < lenA) {
|
||||
dstKey[dstPosA] = keyA;
|
||||
dstVal[dstPosA] = valA;
|
||||
}
|
||||
if (threadIdx.x < lenA) {
|
||||
dstKey[dstPosA] = keyA;
|
||||
dstVal[dstPosA] = valA;
|
||||
}
|
||||
|
||||
if (threadIdx.x < lenB) {
|
||||
dstKey[dstPosB] = keyB;
|
||||
dstVal[dstPosB] = valB;
|
||||
}
|
||||
if (threadIdx.x < lenB) {
|
||||
dstKey[dstPosB] = keyB;
|
||||
dstVal[dstPosB] = valB;
|
||||
}
|
||||
}
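
// Editor's worked example (hypothetical data): merging A = {1, 3, 5} and B = {2, 4, 6} in
// ascending order, the thread holding keyA == 3 (threadIdx.x == 1) gets
// binarySearchExclusive(3, B) == 1, so dstPosA == 1 + 1 == 2 and the key lands at dst[2].
// A-keys use the exclusive search and B-keys the inclusive one, so equal keys from A are
// placed before their B counterparts and the merge stays stable.
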
template <uint sortDir>
|
||||
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal,
|
||||
uint *d_SrcKey, uint *d_SrcVal,
|
||||
uint *d_LimitsA, uint *d_LimitsB,
|
||||
uint stride, uint N) {
|
||||
// Handle to thread block group
|
||||
cg::thread_block cta = cg::this_thread_block();
|
||||
__shared__ uint s_key[2 * SAMPLE_STRIDE];
|
||||
__shared__ uint s_val[2 * SAMPLE_STRIDE];
|
||||
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,
|
||||
uint *d_DstVal,
|
||||
uint *d_SrcKey,
|
||||
uint *d_SrcVal,
|
||||
uint *d_LimitsA,
|
||||
uint *d_LimitsB,
|
||||
uint stride,
|
||||
uint N)
|
||||
{
|
||||
// Handle to thread block group
|
||||
cg::thread_block cta = cg::this_thread_block();
|
||||
__shared__ uint s_key[2 * SAMPLE_STRIDE];
|
||||
__shared__ uint s_val[2 * SAMPLE_STRIDE];
|
||||
|
||||
const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
|
||||
const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
|
||||
d_SrcKey += segmentBase;
|
||||
d_SrcVal += segmentBase;
|
||||
d_DstKey += segmentBase;
|
||||
d_DstVal += segmentBase;
|
||||
const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
|
||||
const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
|
||||
d_SrcKey += segmentBase;
|
||||
d_SrcVal += segmentBase;
|
||||
d_DstKey += segmentBase;
|
||||
d_DstVal += segmentBase;
|
||||
|
||||
// Set up threadblock-wide parameters
|
||||
__shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;
|
||||
// Set up threadblock-wide parameters
|
||||
__shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
uint segmentElementsA = stride;
|
||||
uint segmentElementsB = umin(stride, N - segmentBase - stride);
|
||||
uint segmentSamplesA = getSampleCount(segmentElementsA);
|
||||
uint segmentSamplesB = getSampleCount(segmentElementsB);
|
||||
uint segmentSamples = segmentSamplesA + segmentSamplesB;
|
||||
if (threadIdx.x == 0) {
|
||||
uint segmentElementsA = stride;
|
||||
uint segmentElementsB = umin(stride, N - segmentBase - stride);
|
||||
uint segmentSamplesA = getSampleCount(segmentElementsA);
|
||||
uint segmentSamplesB = getSampleCount(segmentElementsB);
|
||||
uint segmentSamples = segmentSamplesA + segmentSamplesB;
|
||||
|
||||
startSrcA = d_LimitsA[blockIdx.x];
|
||||
startSrcB = d_LimitsB[blockIdx.x];
|
||||
uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1]
|
||||
: segmentElementsA;
|
||||
uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
|
||||
: segmentElementsB;
|
||||
lenSrcA = endSrcA - startSrcA;
|
||||
lenSrcB = endSrcB - startSrcB;
|
||||
startDstA = startSrcA + startSrcB;
|
||||
startDstB = startDstA + lenSrcA;
|
||||
}
|
||||
|
||||
// Load main input data
|
||||
cg::sync(cta);
|
||||
|
||||
if (threadIdx.x < lenSrcA) {
|
||||
s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x];
|
||||
s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x];
|
||||
}
|
||||
|
||||
if (threadIdx.x < lenSrcB) {
|
||||
s_key[threadIdx.x + SAMPLE_STRIDE] =
|
||||
d_SrcKey[stride + startSrcB + threadIdx.x];
|
||||
s_val[threadIdx.x + SAMPLE_STRIDE] =
|
||||
d_SrcVal[stride + startSrcB + threadIdx.x];
|
||||
}
|
||||
|
||||
// Merge data in shared memory
|
||||
cg::sync(cta);
|
||||
merge<sortDir>(s_key, s_val, s_key + 0, s_val + 0, s_key + SAMPLE_STRIDE,
|
||||
s_val + SAMPLE_STRIDE, lenSrcA, SAMPLE_STRIDE, lenSrcB,
|
||||
SAMPLE_STRIDE, cta);
|
||||
|
||||
// Store merged data
|
||||
cg::sync(cta);
|
||||
|
||||
if (threadIdx.x < lenSrcA) {
|
||||
d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x];
|
||||
d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x];
|
||||
}
|
||||
|
||||
if (threadIdx.x < lenSrcB) {
|
||||
d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
|
||||
d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
static void mergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
|
||||
uint *d_SrcKey, uint *d_SrcVal,
|
||||
uint *d_LimitsA, uint *d_LimitsB,
|
||||
uint stride, uint N, uint sortDir) {
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
uint mergePairs = (lastSegmentElements > stride)
|
||||
? getSampleCount(N)
|
||||
: (N - lastSegmentElements) / SAMPLE_STRIDE;
|
||||
|
||||
if (sortDir) {
|
||||
mergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
|
||||
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
|
||||
N);
|
||||
getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
|
||||
} else {
|
||||
mergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
|
||||
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
|
||||
N);
|
||||
getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
|
||||
uint *d_SrcKey, uint *d_SrcVal,
|
||||
uint batchSize, uint arrayLength,
|
||||
uint sortDir);
|
||||
|
||||
extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
|
||||
uint *d_SrcKey, uint *d_SrcVal,
|
||||
uint *d_LimitsA,
|
||||
uint *d_LimitsB, uint stride,
|
||||
uint N, uint sortDir);
|
||||
|
||||
static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
|
||||
static const uint MAX_SAMPLE_COUNT = 32768;
|
||||
|
||||
extern "C" void initMergeSort(void) {
|
||||
checkCudaErrors(
|
||||
cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
|
||||
checkCudaErrors(
|
||||
cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
|
||||
checkCudaErrors(
|
||||
cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
|
||||
checkCudaErrors(
|
||||
cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
|
||||
}
|
||||
|
||||
extern "C" void closeMergeSort(void) {
|
||||
checkCudaErrors(cudaFree(d_RanksA));
|
||||
checkCudaErrors(cudaFree(d_RanksB));
|
||||
checkCudaErrors(cudaFree(d_LimitsB));
|
||||
checkCudaErrors(cudaFree(d_LimitsA));
|
||||
}
|
||||
|
||||
extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,
|
||||
uint *d_BufVal, uint *d_SrcKey, uint *d_SrcVal,
|
||||
uint N, uint sortDir) {
|
||||
uint stageCount = 0;
|
||||
|
||||
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
|
||||
;
|
||||
|
||||
uint *ikey, *ival, *okey, *oval;
|
||||
|
||||
if (stageCount & 1) {
|
||||
ikey = d_BufKey;
|
||||
ival = d_BufVal;
|
||||
okey = d_DstKey;
|
||||
oval = d_DstVal;
|
||||
} else {
|
||||
ikey = d_DstKey;
|
||||
ival = d_DstVal;
|
||||
okey = d_BufKey;
|
||||
oval = d_BufVal;
|
||||
}
|
||||
|
||||
assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
|
||||
assert(N % SHARED_SIZE_LIMIT == 0);
|
||||
mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT,
|
||||
SHARED_SIZE_LIMIT, sortDir);
|
||||
|
||||
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
|
||||
// Find sample ranks and prepare for limiters merge
|
||||
generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir);
|
||||
|
||||
// Merge ranks and indices
|
||||
mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);
|
||||
|
||||
// Merge elementary intervals
|
||||
mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB,
|
||||
stride, N, sortDir);
|
||||
|
||||
if (lastSegmentElements <= stride) {
|
||||
// Last merge segment consists of a single array which just needs to be
|
||||
// passed through
|
||||
checkCudaErrors(cudaMemcpy(
|
||||
okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
|
||||
lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice));
|
||||
checkCudaErrors(cudaMemcpy(
|
||||
oval + (N - lastSegmentElements), ival + (N - lastSegmentElements),
|
||||
lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice));
|
||||
startSrcA = d_LimitsA[blockIdx.x];
|
||||
startSrcB = d_LimitsB[blockIdx.x];
|
||||
uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
|
||||
uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
|
||||
lenSrcA = endSrcA - startSrcA;
|
||||
lenSrcB = endSrcB - startSrcB;
|
||||
startDstA = startSrcA + startSrcB;
|
||||
startDstB = startDstA + lenSrcA;
|
||||
}
|
||||
|
||||
uint *t;
|
||||
t = ikey;
|
||||
ikey = okey;
|
||||
okey = t;
|
||||
t = ival;
|
||||
ival = oval;
|
||||
oval = t;
|
||||
}
|
||||
// Load main input data
|
||||
cg::sync(cta);
|
||||
|
||||
if (threadIdx.x < lenSrcA) {
|
||||
s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x];
|
||||
s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x];
|
||||
}
|
||||
|
||||
if (threadIdx.x < lenSrcB) {
|
||||
s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x];
|
||||
s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x];
|
||||
}
|
||||
|
||||
// Merge data in shared memory
|
||||
cg::sync(cta);
|
||||
merge<sortDir>(s_key,
|
||||
s_val,
|
||||
s_key + 0,
|
||||
s_val + 0,
|
||||
s_key + SAMPLE_STRIDE,
|
||||
s_val + SAMPLE_STRIDE,
|
||||
lenSrcA,
|
||||
SAMPLE_STRIDE,
|
||||
lenSrcB,
|
||||
SAMPLE_STRIDE,
|
||||
cta);
|
||||
|
||||
// Store merged data
|
||||
cg::sync(cta);
|
||||
|
||||
if (threadIdx.x < lenSrcA) {
|
||||
d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x];
|
||||
d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x];
|
||||
}
|
||||
|
||||
if (threadIdx.x < lenSrcB) {
|
||||
d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
|
||||
d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
static void mergeElementaryIntervals(uint *d_DstKey,
|
||||
uint *d_DstVal,
|
||||
uint *d_SrcKey,
|
||||
uint *d_SrcVal,
|
||||
uint *d_LimitsA,
|
||||
uint *d_LimitsB,
|
||||
uint stride,
|
||||
uint N,
|
||||
uint sortDir)
|
||||
{
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
|
||||
|
||||
if (sortDir) {
|
||||
mergeElementaryIntervalsKernel<1U>
|
||||
<<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
|
||||
getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
|
||||
}
|
||||
else {
|
||||
mergeElementaryIntervalsKernel<0U>
|
||||
<<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
|
||||
getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
|
||||
}
|
||||
}
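
// Editor's sketch (hypothetical values): each block merges one elementary interval of at
// most SAMPLE_STRIDE keys from each half. For N == 4096, stride == 1024, SAMPLE_STRIDE == 128:
// lastSegmentElements == 0, so mergePairs == 4096 / 128 == 32 blocks of SAMPLE_STRIDE threads.
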
extern "C" void bitonicSortShared(uint *d_DstKey,
|
||||
uint *d_DstVal,
|
||||
uint *d_SrcKey,
|
||||
uint *d_SrcVal,
|
||||
uint batchSize,
|
||||
uint arrayLength,
|
||||
uint sortDir);
|
||||
|
||||
extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
|
||||
uint *d_DstVal,
|
||||
uint *d_SrcKey,
|
||||
uint *d_SrcVal,
|
||||
uint *d_LimitsA,
|
||||
uint *d_LimitsB,
|
||||
uint stride,
|
||||
uint N,
|
||||
uint sortDir);
|
||||
|
||||
static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
|
||||
static const uint MAX_SAMPLE_COUNT = 32768;
|
||||
|
||||
extern "C" void initMergeSort(void)
|
||||
{
|
||||
checkCudaErrors(cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
|
||||
}
|
||||
|
||||
extern "C" void closeMergeSort(void)
|
||||
{
|
||||
checkCudaErrors(cudaFree(d_RanksA));
|
||||
checkCudaErrors(cudaFree(d_RanksB));
|
||||
checkCudaErrors(cudaFree(d_LimitsB));
|
||||
checkCudaErrors(cudaFree(d_LimitsA));
|
||||
}
|
||||
|
||||
extern "C" void mergeSort(uint *d_DstKey,
|
||||
uint *d_DstVal,
|
||||
uint *d_BufKey,
|
||||
uint *d_BufVal,
|
||||
uint *d_SrcKey,
|
||||
uint *d_SrcVal,
|
||||
uint N,
|
||||
uint sortDir)
|
||||
{
|
||||
uint stageCount = 0;
|
||||
|
||||
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
|
||||
;
|
||||
|
||||
uint *ikey, *ival, *okey, *oval;
|
||||
|
||||
if (stageCount & 1) {
|
||||
ikey = d_BufKey;
|
||||
ival = d_BufVal;
|
||||
okey = d_DstKey;
|
||||
oval = d_DstVal;
|
||||
}
|
||||
else {
|
||||
ikey = d_DstKey;
|
||||
ival = d_DstVal;
|
||||
okey = d_BufKey;
|
||||
oval = d_BufVal;
|
||||
}
|
||||
|
||||
assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
|
||||
assert(N % SHARED_SIZE_LIMIT == 0);
|
||||
mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, SHARED_SIZE_LIMIT, sortDir);
|
||||
|
||||
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
|
||||
// Find sample ranks and prepare for limiters merge
|
||||
generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir);
|
||||
|
||||
// Merge ranks and indices
|
||||
mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);
|
||||
|
||||
// Merge elementary intervals
|
||||
mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, stride, N, sortDir);
|
||||
|
||||
if (lastSegmentElements <= stride) {
|
||||
// Last merge segment consists of a single array which just needs to be
|
||||
// passed through
|
||||
checkCudaErrors(cudaMemcpy(okey + (N - lastSegmentElements),
|
||||
ikey + (N - lastSegmentElements),
|
||||
lastSegmentElements * sizeof(uint),
|
||||
cudaMemcpyDeviceToDevice));
|
||||
checkCudaErrors(cudaMemcpy(oval + (N - lastSegmentElements),
|
||||
ival + (N - lastSegmentElements),
|
||||
lastSegmentElements * sizeof(uint),
|
||||
cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
|
||||
uint *t;
|
||||
t = ikey;
|
||||
ikey = okey;
|
||||
okey = t;
|
||||
t = ival;
|
||||
ival = oval;
|
||||
oval = t;
|
||||
}
|
||||
}
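
// Editor's usage sketch (buffer names are hypothetical): N must be a multiple of
// SHARED_SIZE_LIMIT and at most SAMPLE_STRIDE * MAX_SAMPLE_COUNT.
//
//     uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
//     // cudaMalloc each buffer with N * sizeof(uint) bytes and upload the
//     // unsorted keys/values into d_SrcKey / d_SrcVal ...
//     initMergeSort();
//     mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, 1);
//     checkCudaErrors(cudaDeviceSynchronize());
//     closeMergeSort();
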
@@ -31,19 +31,17 @@
|
||||
typedef unsigned int uint;
|
||||
|
||||
#define SHARED_SIZE_LIMIT 1024U
|
||||
#define SAMPLE_STRIDE 128
|
||||
#define SAMPLE_STRIDE 128
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Extensive sort validation routine
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
|
||||
uint arrayLength, uint numValues,
|
||||
uint sortDir);
|
||||
extern "C" uint
|
||||
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir);
|
||||
|
||||
extern "C" void fillValues(uint *val, uint N);
|
||||
|
||||
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey,
|
||||
uint batchSize, uint arrayLength);
|
||||
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// CUDA merge sort
|
||||
@@ -52,13 +50,11 @@ extern "C" void initMergeSort(void);
|
||||
|
||||
extern "C" void closeMergeSort(void);
|
||||
|
||||
extern "C" void mergeSort(uint *dstKey, uint *dstVal, uint *bufKey,
|
||||
uint *bufVal, uint *srcKey, uint *srcVal, uint N,
|
||||
uint sortDir);
|
||||
extern "C" void
|
||||
mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// CPU "emulation"
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
|
||||
uint *bufVal, uint *srcKey, uint *srcVal, uint N,
|
||||
uint sortDir);
|
||||
extern "C" void
|
||||
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);
|
||||
|
@@ -29,329 +29,335 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "mergeSort_common.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Helper functions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
static void checkOrder(uint *data, uint N, uint sortDir) {
|
||||
if (N <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (uint i = 0; i < N - 1; i++)
|
||||
if ((sortDir && (data[i] > data[i + 1])) ||
|
||||
(!sortDir && (data[i] < data[i + 1]))) {
|
||||
fprintf(stderr, "checkOrder() failed!!!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
static void checkOrder(uint *data, uint N, uint sortDir)
|
||||
{
|
||||
if (N <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (uint i = 0; i < N - 1; i++)
|
||||
if ((sortDir && (data[i] > data[i + 1])) || (!sortDir && (data[i] < data[i + 1]))) {
|
||||
fprintf(stderr, "checkOrder() failed!!!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
static uint umin(uint a, uint b) { return (a <= b) ? a : b; }
|
||||
|
||||
static uint getSampleCount(uint dividend) {
|
||||
return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1)
|
||||
: (dividend / SAMPLE_STRIDE);
|
||||
static uint getSampleCount(uint dividend)
|
||||
{
|
||||
return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1) : (dividend / SAMPLE_STRIDE);
|
||||
}
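
// Editor's example: with SAMPLE_STRIDE == 128, getSampleCount(300) == 300 / 128 + 1 == 3,
// i.e. the number of SAMPLE_STRIDE-wide samples needed to cover 300 elements.
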
static uint nextPowerOfTwo(uint x) {
|
||||
--x;
|
||||
x |= x >> 1;
|
||||
x |= x >> 2;
|
||||
x |= x >> 4;
|
||||
x |= x >> 8;
|
||||
x |= x >> 16;
|
||||
return ++x;
|
||||
static uint nextPowerOfTwo(uint x)
|
||||
{
|
||||
--x;
|
||||
x |= x >> 1;
|
||||
x |= x >> 2;
|
||||
x |= x >> 4;
|
||||
x |= x >> 8;
|
||||
x |= x >> 16;
|
||||
return ++x;
|
||||
}
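
// Editor's example: nextPowerOfTwo(5) decrements to 4 (0b100), the OR cascade smears it to
// 7 (0b111), and the final increment returns 8; an exact power such as 8 is returned unchanged.
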
static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir) {
|
||||
if (L == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint pos = 0;
|
||||
|
||||
for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
|
||||
uint newPos = umin(pos + stride, L);
|
||||
|
||||
if ((sortDir && (data[newPos - 1] <= val)) ||
|
||||
(!sortDir && (data[newPos - 1] >= val))) {
|
||||
pos = newPos;
|
||||
static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir)
|
||||
{
|
||||
if (L == 0) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return pos;
|
||||
uint pos = 0;
|
||||
|
||||
for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
|
||||
uint newPos = umin(pos + stride, L);
|
||||
|
||||
if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
|
||||
pos = newPos;
|
||||
}
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir) {
|
||||
if (L == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint pos = 0;
|
||||
|
||||
for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
|
||||
uint newPos = umin(pos + stride, L);
|
||||
|
||||
if ((sortDir && (data[newPos - 1] < val)) ||
|
||||
(!sortDir && (data[newPos - 1] > val))) {
|
||||
pos = newPos;
|
||||
static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir)
|
||||
{
|
||||
if (L == 0) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
return pos;
|
||||
uint pos = 0;
|
||||
|
||||
for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
|
||||
uint newPos = umin(pos + stride, L);
|
||||
|
||||
if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
|
||||
pos = newPos;
|
||||
}
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
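
// Editor's worked example: for ascending data = {1, 3, 3, 7} and val == 3,
// binarySearchExclusive returns 1 (keys strictly below 3) while binarySearchInclusive
// returns 3 (keys <= 3). The asymmetry mirrors the device-side searches and keeps the
// merge stable when both inputs contain equal keys.
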
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 1: find sample ranks in each segment
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey,
|
||||
uint stride, uint N, uint sortDir) {
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
uint sampleCount =
|
||||
(lastSegmentElements > stride)
|
||||
? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
|
||||
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint stride, uint N, uint sortDir)
|
||||
{
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
|
||||
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||
|
||||
for (uint pos = 0; pos < sampleCount; pos++) {
|
||||
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
|
||||
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
|
||||
for (uint pos = 0; pos < sampleCount; pos++) {
|
||||
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
|
||||
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
|
||||
|
||||
const uint lenA = stride;
|
||||
const uint lenB = umin(stride, N - segmentBase - stride);
|
||||
const uint nA = stride / SAMPLE_STRIDE;
|
||||
const uint nB = getSampleCount(lenB);
|
||||
const uint lenA = stride;
|
||||
const uint lenB = umin(stride, N - segmentBase - stride);
|
||||
const uint nA = stride / SAMPLE_STRIDE;
|
||||
const uint nB = getSampleCount(lenB);
|
||||
|
||||
if (i < nA) {
|
||||
ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
|
||||
ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] =
|
||||
binarySearchExclusive(srcKey[segmentBase + i * SAMPLE_STRIDE],
|
||||
srcKey + segmentBase + stride, lenB, sortDir);
|
||||
if (i < nA) {
|
||||
ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
|
||||
ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = binarySearchExclusive(
|
||||
srcKey[segmentBase + i * SAMPLE_STRIDE], srcKey + segmentBase + stride, lenB, sortDir);
|
||||
}
|
||||
|
||||
if (i < nB) {
|
||||
ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
|
||||
ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = binarySearchInclusive(
|
||||
srcKey[segmentBase + stride + i * SAMPLE_STRIDE], srcKey + segmentBase, lenA, sortDir);
|
||||
}
|
||||
}
|
||||
|
||||
if (i < nB) {
|
||||
ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
|
||||
ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] =
|
||||
binarySearchInclusive(
|
||||
srcKey[segmentBase + stride + i * SAMPLE_STRIDE],
|
||||
srcKey + segmentBase, lenA, sortDir);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 2: merge ranks and indices to derive elementary intervals
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride,
|
||||
uint N) {
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
uint sampleCount =
|
||||
(lastSegmentElements > stride)
|
||||
? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
|
||||
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N)
|
||||
{
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
|
||||
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
|
||||
|
||||
for (uint pos = 0; pos < sampleCount; pos++) {
|
||||
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
|
||||
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
|
||||
for (uint pos = 0; pos < sampleCount; pos++) {
|
||||
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
|
||||
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
|
||||
|
||||
const uint lenA = stride;
|
||||
const uint lenB = umin(stride, N - segmentBase - stride);
|
||||
const uint nA = stride / SAMPLE_STRIDE;
|
||||
const uint nB = getSampleCount(lenB);
|
||||
const uint lenA = stride;
|
||||
const uint lenB = umin(stride, N - segmentBase - stride);
|
||||
const uint nA = stride / SAMPLE_STRIDE;
|
||||
const uint nB = getSampleCount(lenB);
|
||||
|
||||
if (i < nA) {
|
||||
uint dstPosA =
|
||||
binarySearchExclusive(ranks[(segmentBase + 0) / SAMPLE_STRIDE + i],
|
||||
ranks + (segmentBase + stride) / SAMPLE_STRIDE,
|
||||
nB, 1) +
|
||||
i;
|
||||
assert(dstPosA < nA + nB);
|
||||
limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] =
|
||||
ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
|
||||
if (i < nA) {
|
||||
uint dstPosA =
|
||||
binarySearchExclusive(
|
||||
ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], ranks + (segmentBase + stride) / SAMPLE_STRIDE, nB, 1)
|
||||
+ i;
|
||||
assert(dstPosA < nA + nB);
|
||||
limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
|
||||
}
|
||||
|
||||
if (i < nB) {
|
||||
uint dstPosA =
|
||||
binarySearchInclusive(
|
||||
ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1)
|
||||
+ i;
|
||||
assert(dstPosA < nA + nB);
|
||||
limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
|
||||
}
|
||||
}
|
||||
|
||||
if (i < nB) {
|
||||
uint dstPosA = binarySearchInclusive(
|
||||
ranks[(segmentBase + stride) / SAMPLE_STRIDE + i],
|
||||
ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1) +
|
||||
i;
|
||||
assert(dstPosA < nA + nB);
|
||||
limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] =
|
||||
ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge step 3: merge elementary intervals (each interval is <= SAMPLE_STRIDE)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
static void merge(uint *dstKey, uint *dstVal, uint *srcAKey, uint *srcAVal,
|
||||
uint *srcBKey, uint *srcBVal, uint lenA, uint lenB,
|
||||
uint sortDir) {
|
||||
checkOrder(srcAKey, lenA, sortDir);
|
||||
checkOrder(srcBKey, lenB, sortDir);
|
||||
static void merge(uint *dstKey,
|
||||
uint *dstVal,
|
||||
uint *srcAKey,
|
||||
uint *srcAVal,
|
||||
uint *srcBKey,
|
||||
uint *srcBVal,
|
||||
uint lenA,
|
||||
uint lenB,
|
||||
uint sortDir)
|
||||
{
|
||||
checkOrder(srcAKey, lenA, sortDir);
|
||||
checkOrder(srcBKey, lenB, sortDir);
|
||||
|
||||
for (uint i = 0; i < lenA; i++) {
|
||||
uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i;
|
||||
assert(dstPos < lenA + lenB);
|
||||
dstKey[dstPos] = srcAKey[i];
|
||||
dstVal[dstPos] = srcAVal[i];
|
||||
}
|
||||
for (uint i = 0; i < lenA; i++) {
|
||||
uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i;
|
||||
assert(dstPos < lenA + lenB);
|
||||
dstKey[dstPos] = srcAKey[i];
|
||||
dstVal[dstPos] = srcAVal[i];
|
||||
}
|
||||
|
||||
for (uint i = 0; i < lenB; i++) {
|
||||
uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i;
|
||||
assert(dstPos < lenA + lenB);
|
||||
dstKey[dstPos] = srcBKey[i];
|
||||
dstVal[dstPos] = srcBVal[i];
|
||||
}
|
||||
for (uint i = 0; i < lenB; i++) {
|
||||
uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i;
|
||||
assert(dstPos < lenA + lenB);
|
||||
dstKey[dstPos] = srcBKey[i];
|
||||
dstVal[dstPos] = srcBVal[i];
|
||||
}
|
||||
}
|
||||
|
||||
static void mergeElementaryIntervals(uint *dstKey, uint *dstVal, uint *srcKey,
|
||||
uint *srcVal, uint *limitsA, uint *limitsB,
|
||||
uint stride, uint N, uint sortDir) {
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
uint mergePairs = (lastSegmentElements > stride)
|
||||
? getSampleCount(N)
|
||||
: (N - lastSegmentElements) / SAMPLE_STRIDE;
|
||||
static void mergeElementaryIntervals(uint *dstKey,
|
||||
uint *dstVal,
|
||||
uint *srcKey,
|
||||
uint *srcVal,
|
||||
uint *limitsA,
|
||||
uint *limitsB,
|
||||
uint stride,
|
||||
uint N,
|
||||
uint sortDir)
|
||||
{
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
|
||||
|
||||
for (uint pos = 0; pos < mergePairs; pos++) {
|
||||
uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
|
||||
uint segmentBase = (pos - i) * SAMPLE_STRIDE;
|
||||
for (uint pos = 0; pos < mergePairs; pos++) {
|
||||
uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
|
||||
uint segmentBase = (pos - i) * SAMPLE_STRIDE;
|
||||
|
||||
const uint lenA = stride;
|
||||
const uint lenB = umin(stride, N - segmentBase - stride);
|
||||
const uint nA = stride / SAMPLE_STRIDE;
|
||||
const uint nB = getSampleCount(lenB);
|
||||
const uint n = nA + nB;
|
||||
const uint lenA = stride;
|
||||
const uint lenB = umin(stride, N - segmentBase - stride);
|
||||
const uint nA = stride / SAMPLE_STRIDE;
|
||||
const uint nB = getSampleCount(lenB);
|
||||
const uint n = nA + nB;
|
||||
|
||||
const uint startPosA = limitsA[pos];
|
||||
const uint endPosA = (i + 1 < n) ? limitsA[pos + 1] : lenA;
|
||||
const uint startPosB = limitsB[pos];
|
||||
const uint endPosB = (i + 1 < n) ? limitsB[pos + 1] : lenB;
|
||||
const uint startPosDst = startPosA + startPosB;
|
||||
const uint startPosA = limitsA[pos];
|
||||
const uint endPosA = (i + 1 < n) ? limitsA[pos + 1] : lenA;
|
||||
const uint startPosB = limitsB[pos];
|
||||
const uint endPosB = (i + 1 < n) ? limitsB[pos + 1] : lenB;
|
||||
const uint startPosDst = startPosA + startPosB;
|
||||
|
||||
assert(startPosA <= endPosA && endPosA <= lenA);
|
||||
assert(startPosB <= endPosB && endPosB <= lenB);
|
||||
assert((endPosA - startPosA) <= SAMPLE_STRIDE);
|
||||
assert((endPosB - startPosB) <= SAMPLE_STRIDE);
|
||||
assert(startPosA <= endPosA && endPosA <= lenA);
|
||||
assert(startPosB <= endPosB && endPosB <= lenB);
|
||||
assert((endPosA - startPosA) <= SAMPLE_STRIDE);
|
||||
assert((endPosB - startPosB) <= SAMPLE_STRIDE);
|
||||
|
||||
merge(dstKey + segmentBase + startPosDst,
|
||||
dstVal + segmentBase + startPosDst,
|
||||
(srcKey + segmentBase + 0) + startPosA,
|
||||
(srcVal + segmentBase + 0) + startPosA,
|
||||
(srcKey + segmentBase + stride) + startPosB,
|
||||
(srcVal + segmentBase + stride) + startPosB, endPosA - startPosA,
|
||||
endPosB - startPosB, sortDir);
|
||||
}
|
||||
merge(dstKey + segmentBase + startPosDst,
|
||||
dstVal + segmentBase + startPosDst,
|
||||
(srcKey + segmentBase + 0) + startPosA,
|
||||
(srcVal + segmentBase + 0) + startPosA,
|
||||
(srcKey + segmentBase + stride) + startPosB,
|
||||
(srcVal + segmentBase + stride) + startPosB,
|
||||
endPosA - startPosA,
|
||||
endPosB - startPosB,
|
||||
sortDir);
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Naive bubble sort (reference implementation)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir) {
|
||||
if (N <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (uint bottom = 0; bottom < N - 1; bottom++) {
|
||||
uint savePos = bottom;
|
||||
uint saveKey = key[bottom];
|
||||
|
||||
for (uint i = bottom + 1; i < N; i++)
|
||||
if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) {
|
||||
savePos = i;
|
||||
saveKey = key[i];
|
||||
}
|
||||
|
||||
if (savePos != bottom) {
|
||||
uint t;
|
||||
t = key[savePos];
|
||||
key[savePos] = key[bottom];
|
||||
key[bottom] = t;
|
||||
t = val[savePos];
|
||||
val[savePos] = val[bottom];
|
||||
val[bottom] = t;
|
||||
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir)
|
||||
{
|
||||
if (N <= 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (uint bottom = 0; bottom < N - 1; bottom++) {
|
||||
uint savePos = bottom;
|
||||
uint saveKey = key[bottom];
|
||||
|
||||
for (uint i = bottom + 1; i < N; i++)
|
||||
if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) {
|
||||
savePos = i;
|
||||
saveKey = key[i];
|
||||
}
|
||||
|
||||
if (savePos != bottom) {
|
||||
uint t;
|
||||
t = key[savePos];
|
||||
key[savePos] = key[bottom];
|
||||
key[bottom] = t;
|
||||
t = val[savePos];
|
||||
val[savePos] = val[bottom];
|
||||
val[bottom] = t;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Interface function
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
|
||||
uint *bufVal, uint *srcKey, uint *srcVal, uint N,
|
||||
uint sortDir) {
|
||||
uint *ikey, *ival, *okey, *oval;
|
||||
uint stageCount = 0;
|
||||
extern "C" void
|
||||
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir)
|
||||
{
|
||||
uint *ikey, *ival, *okey, *oval;
|
||||
uint stageCount = 0;
|
||||
|
||||
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
|
||||
;
|
||||
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
|
||||
;
|
||||
|
||||
if (stageCount & 1) {
|
||||
ikey = bufKey;
|
||||
ival = bufVal;
|
||||
okey = dstKey;
|
||||
oval = dstVal;
|
||||
} else {
|
||||
ikey = dstKey;
|
||||
ival = dstVal;
|
||||
okey = bufKey;
|
||||
oval = bufVal;
|
||||
}
|
||||
|
||||
printf("Bottom-level sort...\n");
|
||||
memcpy(ikey, srcKey, N * sizeof(uint));
|
||||
memcpy(ival, srcVal, N * sizeof(uint));
|
||||
|
||||
for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
|
||||
bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos),
|
||||
sortDir);
|
||||
}
|
||||
|
||||
printf("Merge...\n");
|
||||
uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
||||
uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
||||
uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
||||
uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
||||
memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
|
||||
memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
|
||||
memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
|
||||
memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));
|
||||
|
||||
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
|
||||
// Find sample ranks and prepare for limiters merge
|
||||
generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);
|
||||
|
||||
// Merge ranks and indices
|
||||
mergeRanksAndIndices(limitsA, ranksA, stride, N);
|
||||
mergeRanksAndIndices(limitsB, ranksB, stride, N);
|
||||
|
||||
// Merge elementary intervals
|
||||
mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride,
|
||||
N, sortDir);
|
||||
|
||||
if (lastSegmentElements <= stride) {
|
||||
// Last merge segment consists of a single array which just needs to be
|
||||
// passed through
|
||||
memcpy(okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
|
||||
lastSegmentElements * sizeof(uint));
|
||||
memcpy(oval + (N - lastSegmentElements), ival + (N - lastSegmentElements),
|
||||
lastSegmentElements * sizeof(uint));
|
||||
if (stageCount & 1) {
|
||||
ikey = bufKey;
|
||||
ival = bufVal;
|
||||
okey = dstKey;
|
||||
oval = dstVal;
|
||||
}
|
||||
else {
|
||||
ikey = dstKey;
|
||||
ival = dstVal;
|
||||
okey = bufKey;
|
||||
oval = bufVal;
|
||||
}
|
||||
|
||||
uint *t;
|
||||
t = ikey;
|
||||
ikey = okey;
|
||||
okey = t;
|
||||
t = ival;
|
||||
ival = oval;
|
||||
oval = t;
|
||||
}
|
||||
printf("Bottom-level sort...\n");
|
||||
memcpy(ikey, srcKey, N * sizeof(uint));
|
||||
memcpy(ival, srcVal, N * sizeof(uint));
|
||||
|
||||
free(limitsB);
|
||||
free(limitsA);
|
||||
free(ranksB);
|
||||
free(ranksA);
|
||||
for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
|
||||
bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos), sortDir);
|
||||
}
|
||||
|
||||
printf("Merge...\n");
|
||||
uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
||||
uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
||||
uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
||||
uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
|
||||
memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
|
||||
memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
|
||||
memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
|
||||
memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));
|
||||
|
||||
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
|
||||
uint lastSegmentElements = N % (2 * stride);
|
||||
|
||||
// Find sample ranks and prepare for limiters merge
|
||||
generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);
|
||||
|
||||
// Merge ranks and indices
|
||||
mergeRanksAndIndices(limitsA, ranksA, stride, N);
|
||||
mergeRanksAndIndices(limitsB, ranksB, stride, N);
|
||||
|
||||
// Merge elementary intervals
|
||||
mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, N, sortDir);
|
||||
|
||||
if (lastSegmentElements <= stride) {
|
||||
// Last merge segment consists of a single array which just needs to be
|
||||
// passed through
|
||||
memcpy(
|
||||
okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
|
||||
memcpy(
|
||||
oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
|
||||
}
|
||||
|
||||
uint *t;
|
||||
t = ikey;
|
||||
ikey = okey;
|
||||
okey = t;
|
||||
t = ival;
|
||||
ival = oval;
|
||||
oval = t;
|
||||
}
|
||||
|
||||
free(limitsB);
|
||||
free(limitsA);
|
||||
free(ranksB);
|
||||
free(ranksA);
|
||||
}
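
// Editor's usage sketch (host buffer names are hypothetical): the CPU reference path is
// called exactly like the GPU version and its output can be checked afterwards, e.g.
//
//     mergeSortHost(h_DstKey, h_DstVal, h_BufKey, h_BufVal, h_SrcKey, h_SrcVal, N, 1);
//     validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, 1);
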
@@ -29,104 +29,100 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "mergeSort_common.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Validate sorted keys array (check for integrity and proper order)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
|
||||
uint arrayLength, uint numValues,
|
||||
uint sortDir) {
|
||||
uint *srcHist;
|
||||
uint *resHist;
|
||||
extern "C" uint
|
||||
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir)
|
||||
{
|
||||
uint *srcHist;
|
||||
uint *resHist;
|
||||
|
||||
if (arrayLength < 2) {
|
||||
printf("validateSortedKeys(): arrays too short, exiting...\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
printf("...inspecting keys array: ");
|
||||
srcHist = (uint *)malloc(numValues * sizeof(uint));
|
||||
resHist = (uint *)malloc(numValues * sizeof(uint));
|
||||
|
||||
int flag = 1;
|
||||
|
||||
for (uint j = 0; j < batchSize;
|
||||
j++, srcKey += arrayLength, resKey += arrayLength) {
|
||||
// Build histograms for keys arrays
|
||||
memset(srcHist, 0, numValues * sizeof(uint));
|
||||
memset(resHist, 0, numValues * sizeof(uint));
|
||||
|
||||
for (uint i = 0; i < arrayLength; i++) {
|
||||
if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
|
||||
srcHist[srcKey[i]]++;
|
||||
resHist[resKey[i]]++;
|
||||
} else {
|
||||
fprintf(
|
||||
stderr,
|
||||
"***Set %u source/result key arrays are not limited properly***\n",
|
||||
j);
|
||||
flag = 0;
|
||||
goto brk;
|
||||
}
|
||||
if (arrayLength < 2) {
|
||||
printf("validateSortedKeys(): arrays too short, exiting...\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Compare the histograms
|
||||
for (uint i = 0; i < numValues; i++)
|
||||
if (srcHist[i] != resHist[i]) {
|
||||
fprintf(stderr,
|
||||
"***Set %u source/result keys histograms do not match***\n", j);
|
||||
flag = 0;
|
||||
goto brk;
|
||||
}
|
||||
printf("...inspecting keys array: ");
|
||||
srcHist = (uint *)malloc(numValues * sizeof(uint));
|
||||
resHist = (uint *)malloc(numValues * sizeof(uint));
|
||||
|
||||
// Finally check the ordering
|
||||
for (uint i = 0; i < arrayLength - 1; i++)
|
||||
if ((sortDir && (resKey[i] > resKey[i + 1])) ||
|
||||
(!sortDir && (resKey[i] < resKey[i + 1]))) {
|
||||
fprintf(stderr,
|
||||
"***Set %u result key array is not ordered properly***\n", j);
|
||||
flag = 0;
|
||||
goto brk;
|
||||
}
|
||||
}
|
||||
int flag = 1;
|
||||
|
||||
for (uint j = 0; j < batchSize; j++, srcKey += arrayLength, resKey += arrayLength) {
|
||||
// Build histograms for keys arrays
|
||||
memset(srcHist, 0, numValues * sizeof(uint));
|
||||
memset(resHist, 0, numValues * sizeof(uint));
|
||||
|
||||
for (uint i = 0; i < arrayLength; i++) {
|
||||
if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
|
||||
srcHist[srcKey[i]]++;
|
||||
resHist[resKey[i]]++;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "***Set %u source/result key arrays are not limited properly***\n", j);
|
||||
flag = 0;
|
||||
goto brk;
|
||||
}
|
||||
}
|
||||
|
||||
// Compare the histograms
|
||||
for (uint i = 0; i < numValues; i++)
|
||||
if (srcHist[i] != resHist[i]) {
|
||||
fprintf(stderr, "***Set %u source/result keys histograms do not match***\n", j);
|
||||
flag = 0;
|
||||
goto brk;
|
||||
}
|
||||
|
||||
// Finally check the ordering
|
||||
for (uint i = 0; i < arrayLength - 1; i++)
|
||||
if ((sortDir && (resKey[i] > resKey[i + 1])) || (!sortDir && (resKey[i] < resKey[i + 1]))) {
|
||||
fprintf(stderr, "***Set %u result key array is not ordered properly***\n", j);
|
||||
flag = 0;
|
||||
goto brk;
|
||||
}
|
||||
}
|
||||
|
||||
brk:
|
||||
free(resHist);
|
||||
free(srcHist);
|
||||
free(resHist);
|
||||
free(srcHist);
|
||||
|
||||
if (flag) printf("OK\n");
|
||||
if (flag)
|
||||
printf("OK\n");
|
||||
|
||||
return flag;
|
||||
return flag;
|
||||
}
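
// Editor's note: the routine verifies that the result keys are a permutation of the source
// keys (per-value histograms must match) and that every result array is ordered; it returns
// 1 on success, and also returns 1 for arrayLength < 2, which is treated as trivially sorted.
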
////////////////////////////////////////////////////////////////////////////////
|
||||
// Value validation / stability check routines
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void fillValues(uint *val, uint N) {
|
||||
for (uint i = 0; i < N; i++) val[i] = i;
|
||||
extern "C" void fillValues(uint *val, uint N)
|
||||
{
|
||||
for (uint i = 0; i < N; i++)
|
||||
val[i] = i;
|
||||
}
|
||||
|
||||
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey,
|
||||
uint batchSize, uint arrayLength) {
|
||||
int correctFlag = 1, stableFlag = 1;
|
||||
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength)
|
||||
{
|
||||
int correctFlag = 1, stableFlag = 1;
|
||||
|
||||
printf("...inspecting keys and values array: ");
|
||||
printf("...inspecting keys and values array: ");
|
||||
|
||||
for (uint i = 0; i < batchSize;
|
||||
i++, resKey += arrayLength, resVal += arrayLength) {
|
||||
for (uint j = 0; j < arrayLength; j++) {
|
||||
if (resKey[j] != srcKey[resVal[j]]) correctFlag = 0;
|
||||
for (uint i = 0; i < batchSize; i++, resKey += arrayLength, resVal += arrayLength) {
|
||||
for (uint j = 0; j < arrayLength; j++) {
|
||||
if (resKey[j] != srcKey[resVal[j]])
|
||||
correctFlag = 0;
|
||||
|
||||
if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) &&
|
||||
(resVal[j] > resVal[j + 1]))
|
||||
stableFlag = 0;
|
||||
if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && (resVal[j] > resVal[j + 1]))
|
||||
stableFlag = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
|
||||
printf(stableFlag ? "...stability property: stable!\n"
|
||||
: "...stability property: NOT stable\n");
|
||||
printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
|
||||
printf(stableFlag ? "...stability property: stable!\n" : "...stability property: NOT stable\n");
|
||||
|
||||
return correctFlag;
|
||||
return correctFlag;
|
||||
}
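
// Editor's worked example (hypothetical data): the stability check above only trips when two
// equal keys come back with their original indices swapped, e.g. resKey = {5, 5} with
// resVal = {7, 3} sets stableFlag = 0 because source index 7 now precedes source index 3.
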
@@ -29,106 +29,105 @@
|
||||
#include <stdio.h>
|
||||
|
||||
// Includes CUDA
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda/barrier>
|
||||
#include <cooperative_groups.h>
|
||||
#include <cuda/barrier>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
// Utilities and timing functions
|
||||
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
|
||||
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
|
||||
|
||||
// CUDA helper functions
|
||||
#include <helper_cuda.h> // helper functions for CUDA error check
|
||||
#include <helper_cuda.h> // helper functions for CUDA error check
|
||||
|
||||
namespace cg = cooperative_groups;
|
||||
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
template <bool writeSquareRoot>
|
||||
__device__ void reduceBlockData(
|
||||
cuda::barrier<cuda::thread_scope_block> &barrier,
|
||||
cg::thread_block_tile<32> &tile32, double &threadSum, double *result) {
|
||||
extern __shared__ double tmp[];
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
|
||||
threadSum += tile32.shfl_down(threadSum, offset);
|
||||
}
|
||||
if (tile32.thread_rank() == 0) {
|
||||
tmp[tile32.meta_group_rank()] = threadSum;
|
||||
}
|
||||
|
||||
auto token = barrier.arrive();
|
||||
|
||||
barrier.wait(std::move(token));
|
||||
|
||||
// Warp 0 performs the last round of the reduction
|
||||
if (tile32.meta_group_rank() == 0) {
|
||||
double beta = tile32.thread_rank() < tile32.meta_group_size()
|
||||
? tmp[tile32.thread_rank()]
|
||||
: 0.0;
|
||||
__device__ void reduceBlockData(cuda::barrier<cuda::thread_scope_block> &barrier,
|
||||
cg::thread_block_tile<32> &tile32,
|
||||
double &threadSum,
|
||||
double *result)
|
||||
{
|
||||
extern __shared__ double tmp[];
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
|
||||
beta += tile32.shfl_down(beta, offset);
|
||||
threadSum += tile32.shfl_down(threadSum, offset);
|
||||
}
|
||||
if (tile32.thread_rank() == 0) {
|
||||
tmp[tile32.meta_group_rank()] = threadSum;
|
||||
}
|
||||
|
||||
if (tile32.thread_rank() == 0) {
|
||||
if (writeSquareRoot)
|
||||
*result = sqrt(beta);
|
||||
else
|
||||
*result = beta;
|
||||
auto token = barrier.arrive();
|
||||
|
||||
barrier.wait(std::move(token));
|
||||
|
||||
// Warp 0 performs the last round of the reduction
|
||||
if (tile32.meta_group_rank() == 0) {
|
||||
double beta = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0;
|
||||
|
||||
#pragma unroll
|
||||
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
|
||||
beta += tile32.shfl_down(beta, offset);
|
||||
}
|
||||
|
||||
if (tile32.thread_rank() == 0) {
|
||||
if (writeSquareRoot)
|
||||
*result = sqrt(beta);
|
||||
else
|
||||
*result = beta;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
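
// Editor's note: reduceBlockData first folds each 32-thread tile with shuffle offsets
// 16, 8, 4, 2, 1, stages one partial sum per warp in the dynamic shared array tmp[], and
// uses the arrive/wait barrier so warp 0 can finish the block-wide sum; the cuda::barrier
// path needs compute capability 7.0 or newer, hence the __CUDA_ARCH__ guard.
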
__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB,
double *partialResults, int size) {
__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size)
{
#if __CUDA_ARCH__ >= 700
#pragma diag_suppress static_var_with_dynamic_init
cg::thread_block cta = cg::this_thread_block();
cg::grid_group grid = cg::this_grid();
;
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
cg::thread_block cta = cg::this_thread_block();
cg::grid_group grid = cg::this_grid();
;
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

__shared__ cuda::barrier<cuda::thread_scope_block> barrier;
__shared__ cuda::barrier<cuda::thread_scope_block> barrier;

if (threadIdx.x == 0) {
init(&barrier, blockDim.x);
}

cg::sync(cta);

double threadSum = 0.0;
for (int i = grid.thread_rank(); i < size; i += grid.size()) {
threadSum += (double)(vecA[i] * vecB[i]);
}

// Each thread block performs reduction of partial dotProducts and writes to
// global mem.
reduceBlockData<false>(barrier, tile32, threadSum,
&partialResults[blockIdx.x]);

cg::sync(grid);

// One block performs the final summation of partial dot products
// of all the thread blocks and writes the sqrt of final dot product.
if (blockIdx.x == 0) {
threadSum = 0.0;
for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) {
threadSum += partialResults[i];
if (threadIdx.x == 0) {
init(&barrier, blockDim.x);
}
reduceBlockData<true>(barrier, tile32, threadSum, &partialResults[0]);
}

cg::sync(grid);
cg::sync(cta);

const double finalValue = partialResults[0];
double threadSum = 0.0;
for (int i = grid.thread_rank(); i < size; i += grid.size()) {
threadSum += (double)(vecA[i] * vecB[i]);
}

// Perform normalization of vecA & vecB.
for (int i = grid.thread_rank(); i < size; i += grid.size()) {
vecA[i] = (float)vecA[i] / finalValue;
vecB[i] = (float)vecB[i] / finalValue;
}
// Each thread block performs reduction of partial dotProducts and writes to
// global mem.
reduceBlockData<false>(barrier, tile32, threadSum, &partialResults[blockIdx.x]);

cg::sync(grid);

// One block performs the final summation of partial dot products
// of all the thread blocks and writes the sqrt of final dot product.
if (blockIdx.x == 0) {
threadSum = 0.0;
for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) {
threadSum += partialResults[i];
}
reduceBlockData<true>(barrier, tile32, threadSum, &partialResults[0]);
}

cg::sync(grid);

const double finalValue = partialResults[0];

// Perform normalization of vecA & vecB.
for (int i = grid.thread_rank(); i < size; i += grid.size()) {
vecA[i] = (float)vecA[i] / finalValue;
vecB[i] = (float)vecB[i] / finalValue;
}
#endif
}

@@ -137,119 +136,113 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
printf("%s starting...\n", argv[0]);
int main(int argc, char **argv)
{
printf("%s starting...\n", argv[0]);

// This will pick the best possible CUDA capable device
int dev = findCudaDevice(argc, (const char **)argv);
// This will pick the best possible CUDA capable device
int dev = findCudaDevice(argc, (const char **)argv);

int major = 0;
checkCudaErrors(
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
int major = 0;
checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));

// Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher.
if (major < 7) {
printf("simpleAWBarrier requires SM 7.0 or higher. Exiting...\n");
exit(EXIT_WAIVED);
}

int supportsCooperativeLaunch = 0;
checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch,
cudaDevAttrCooperativeLaunch, dev));

if (!supportsCooperativeLaunch) {
printf(
"\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
"Waiving the run\n",
dev);
exit(EXIT_WAIVED);
}

int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev);

printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!");
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
float *vecA, *d_vecA;
float *vecB, *d_vecB;
double *d_partialResults;
int size = 10000000;

checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size));
checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size));

checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size));
checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size));

float baseVal = 2.0;
for (int i = 0; i < size; i++) {
vecA[i] = vecB[i] = baseVal;
}

cudaStream_t stream;
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size,
cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size,
cudaMemcpyHostToDevice, stream));

// Kernel configuration, where a one-dimensional
// grid and one-dimensional blocks are configured.
int minGridSize = 0, blockSize = 0;
checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));

int smemSize = ((blockSize / 32) + 1) * sizeof(double);

int numBlocksPerSm = 0;
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));

int multiProcessorCount = 0;
checkCudaErrors(cudaDeviceGetAttribute(
&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));

minGridSize = multiProcessorCount * numBlocksPerSm;
checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));

printf(
"Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
"blockSize = %d\n",
minGridSize, blockSize);

dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);

void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB,
(void *)&d_partialResults, (void *)&size};

checkCudaErrors(
cudaLaunchCooperativeKernel((void *)normVecByDotProductAWBarrier, dimGrid,
dimBlock, kernelArgs, smemSize, stream));

checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size,
cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaStreamSynchronize(stream));

float expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
unsigned int matches = 0;
for (int i = 0; i < size; i++) {
if ((vecA[i] - expectedResult) > 0.00001) {
printf("mismatch at i = %d\n", i);
break;
} else {
matches++;
// Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher.
if (major < 7) {
printf("simpleAWBarrier requires SM 7.0 or higher. Exiting...\n");
exit(EXIT_WAIVED);
}
}

printf("Result = %s\n", matches == size ? "PASSED" : "FAILED");
checkCudaErrors(cudaFree(d_vecA));
checkCudaErrors(cudaFree(d_vecB));
checkCudaErrors(cudaFree(d_partialResults));
int supportsCooperativeLaunch = 0;
checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev));

checkCudaErrors(cudaFreeHost(vecA));
checkCudaErrors(cudaFreeHost(vecB));
return matches == size;
if (!supportsCooperativeLaunch) {
printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
"Waiving the run\n",
dev);
exit(EXIT_WAIVED);
}

int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev);

printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!");
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
{
float *vecA, *d_vecA;
float *vecB, *d_vecB;
double *d_partialResults;
int size = 10000000;

checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size));
checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size));

checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size));
checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size));

float baseVal = 2.0;
for (int i = 0; i < size; i++) {
vecA[i] = vecB[i] = baseVal;
}

cudaStream_t stream;
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, cudaMemcpyHostToDevice, stream));

// Kernel configuration, where a one-dimensional
// grid and one-dimensional blocks are configured.
int minGridSize = 0, blockSize = 0;
checkCudaErrors(
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));

int smemSize = ((blockSize / 32) + 1) * sizeof(double);

int numBlocksPerSm = 0;
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));

int multiProcessorCount = 0;
checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));

minGridSize = multiProcessorCount * numBlocksPerSm;
checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));

printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
"blockSize = %d\n",
minGridSize,
blockSize);

dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);

void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, (void *)&d_partialResults, (void *)&size};

checkCudaErrors(cudaLaunchCooperativeKernel(
(void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream));

checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaStreamSynchronize(stream));

float expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
unsigned int matches = 0;
for (int i = 0; i < size; i++) {
if ((vecA[i] - expectedResult) > 0.00001) {
printf("mismatch at i = %d\n", i);
break;
}
else {
matches++;
}
}

printf("Result = %s\n", matches == size ? "PASSED" : "FAILED");
checkCudaErrors(cudaFree(d_vecA));
checkCudaErrors(cudaFree(d_vecB));
checkCudaErrors(cudaFree(d_partialResults));

checkCudaErrors(cudaFreeHost(vecA));
checkCudaErrors(cudaFreeHost(vecB));
return matches == size;
}

@@ -34,17 +34,17 @@
#endif

// Includes, system
#include <stdio.h>
#include <cassert>
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check
#include <helper_cuda.h> // helper functions for CUDA error check

const char *sampleName = "simpleAssert";

@@ -58,9 +58,10 @@ bool testResult = true;
//! Tests assert function.
//! Thread whose id > N will print assertion failed error message.
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int N) {
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
assert(gtid < N);
__global__ void testKernel(int N)
{
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
assert(gtid < N);
}

////////////////////////////////////////////////////////////////////////////////
@@ -70,59 +71,60 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
printf("%s starting...\n", sampleName);
int main(int argc, char **argv)
{
printf("%s starting...\n", sampleName);

runTest(argc, argv);
runTest(argc, argv);

printf("%s completed, returned %s\n", sampleName,
testResult ? "OK" : "ERROR!");
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");