Apply consistent code formatting across the repo. Add clang-format and pre-commit hooks.

Author: Rob Armstrong
Date:   2025-03-27 10:30:07 -07:00
parent 2cd58fbc9a
commit ceab6e8bcc
782 changed files with 107230 additions and 106548 deletions

.clang-format (new file, 49 lines added)

@@ -0,0 +1,49 @@
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: Consecutive
AlignConsecutiveDeclarations: Consecutive
AlignConsecutiveMacros: Consecutive
AlignEscapedNewlines: Left
AlignOperands: AlignAfterOperator
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
  AfterClass: true
  AfterControlStatement: false
  AfterExternBlock: true
  AfterFunction: true
  AfterStruct: true
  AfterUnion: true
  BeforeCatch: true
  BeforeElse: true
  IndentBraces: false
BreakBeforeBraces: Custom
BreakBeforeConceptDeclarations: true
BreakBeforeBinaryOperators: NonAssignment
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
ColumnLimit: 120
DerivePointerAlignment: false
FixNamespaceComments: true
IncludeCategories:
  - Regex: '^<.*>'
    Priority: 1
  - Regex: '^".*"'
    Priority: 2
SortIncludes: true
IncludeBlocks: Regroup
IndentWidth: 4
MaxEmptyLinesToKeep: 2
PointerAlignment: Right
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
Standard: c++17
TabWidth: 4
UseTab: Never
...
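
For orientation, here is a small hypothetical C++ fragment (sumAndCount and every other name in it are invented for illustration, not code from this repository) written the way the settings above lay code out: 4-space indents, function braces on their own line, control-statement braces kept on the same line, else moved to a new line, right-aligned pointers, and consecutive declarations/assignments padded into columns.

#include <cstddef>
#include <cstdio>

// Hypothetical sample, not from the repo: illustrates the brace, indent and alignment
// choices configured in the .clang-format file above.
static float sumAndCount(const float *values, std::size_t n, std::size_t *outCount)
{ // AfterFunction: true wraps this brace onto its own line
    float       total = 0.0f; // AlignConsecutiveDeclarations / AlignConsecutiveAssignments
    std::size_t used  = 0;    // pad these two declarations into matching columns

    for (std::size_t i = 0; i < n; i++) { // AfterControlStatement: false keeps '{' on this line
        if (values[i] > 0.0f) {
            total += values[i];
            used++;
        }
        else { // BeforeElse: true puts 'else' on its own line
            total -= values[i];
        }
    }

    if (outCount != nullptr) {
        *outCount = used; // PointerAlignment: Right binds '*' to the name in the signature
    }
    return total;
}

int main()
{
    const float values[] = {1.0f, -2.0f, 3.0f};

    std::size_t positives = 0;
    float       total     = sumAndCount(values, 3, &positives);
    std::printf("total = %f, positive elements = %zu\n", total, positives);
    return 0;
}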

.pre-commit-config.yaml (new file, 100 lines added)

@@ -0,0 +1,100 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
ci:
  autofix_commit_msg: |
    [pre-commit.ci] auto code formatting
  autofix_prs: false
  autoupdate_branch: ''
  autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
  autoupdate_schedule: quarterly
  skip: []
  submodules: false
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: end-of-file-fixer
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
      - id: mixed-line-ending
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
      - id: trailing-whitespace
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.6
    hooks:
      - id: clang-format
        types_or: [file]
        files: |
          (?x)^(
            ^.*\.c$|
            ^.*\.cpp$|
            ^.*\.cu$|
            ^.*\.cuh$|
            ^.*\.cxx$|
            ^.*\.h$|
            ^.*\.hpp$|
            ^.*\.inl$|
            ^.*\.mm$
          )
        exclude: |
          (?x)^(
            Common/.*
          )
        args: ["-fallback-style=none", "-style=file", "-i"]


@@ -31,10 +31,10 @@
*/
// system includes
#include <algorithm>
#include <cstdio>
#include <ctime>
#include <vector>
#include <algorithm>
#ifdef USE_PTHREADS
#include <pthread.h>
#else
@@ -51,291 +51,287 @@
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// SRAND48 and DRAND48 don't exist on windows, but these are the equivalent
// functions
void srand48(long seed) { srand((unsigned int)seed); }
void srand48(long seed) { srand((unsigned int)seed); }
double drand48() { return double(rand()) / RAND_MAX; }
#endif
const char *sSDKname = "UnifiedMemoryStreams";
// simple task
template <typename T>
struct Task {
unsigned int size, id;
T *data;
T *result;
T *vector;
template <typename T> struct Task
{
unsigned int size, id;
T *data;
T *result;
T *vector;
Task() : size(0), id(0), data(NULL), result(NULL), vector(NULL){};
Task(unsigned int s) : size(s), id(0), data(NULL), result(NULL) {
// allocate unified memory -- the operation performed in this example will
// be a DGEMV
checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
checkCudaErrors(cudaDeviceSynchronize());
}
~Task() {
// ensure all memory is deallocated
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaFree(data));
checkCudaErrors(cudaFree(result));
checkCudaErrors(cudaFree(vector));
}
void allocate(const unsigned int s, const unsigned int unique_id) {
// allocate unified memory outside of constructor
id = unique_id;
size = s;
checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
checkCudaErrors(cudaDeviceSynchronize());
// populate data with random elements
for (unsigned int i = 0; i < size * size; i++) {
data[i] = drand48();
Task()
: size(0)
, id(0)
, data(NULL)
, result(NULL)
, vector(NULL) {};
Task(unsigned int s)
: size(s)
, id(0)
, data(NULL)
, result(NULL)
{
// allocate unified memory -- the operation performed in this example will
// be a DGEMV
checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
checkCudaErrors(cudaDeviceSynchronize());
}
for (unsigned int i = 0; i < size; i++) {
result[i] = 0.;
vector[i] = drand48();
~Task()
{
// ensure all memory is deallocated
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaFree(data));
checkCudaErrors(cudaFree(result));
checkCudaErrors(cudaFree(vector));
}
void allocate(const unsigned int s, const unsigned int unique_id)
{
// allocate unified memory outside of constructor
id = unique_id;
size = s;
checkCudaErrors(cudaMallocManaged(&data, sizeof(T) * size * size));
checkCudaErrors(cudaMallocManaged(&result, sizeof(T) * size));
checkCudaErrors(cudaMallocManaged(&vector, sizeof(T) * size));
checkCudaErrors(cudaDeviceSynchronize());
// populate data with random elements
for (unsigned int i = 0; i < size * size; i++) {
data[i] = drand48();
}
for (unsigned int i = 0; i < size; i++) {
result[i] = 0.;
vector[i] = drand48();
}
}
}
};
#ifdef USE_PTHREADS
struct threadData_t {
int tid;
Task<double> *TaskListPtr;
cudaStream_t *streams;
cublasHandle_t *handles;
int taskSize;
struct threadData_t
{
int tid;
Task<double> *TaskListPtr;
cudaStream_t *streams;
cublasHandle_t *handles;
int taskSize;
};
typedef struct threadData_t threadData;
#endif
// simple host dgemv: assume data is in row-major format and square
template <typename T>
void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result) {
// rows
for (int i = 0; i < n; i++) {
result[i] *= beta;
template <typename T> void gemv(int m, int n, T alpha, T *A, T *x, T beta, T *result)
{
// rows
for (int i = 0; i < n; i++) {
result[i] *= beta;
for (int j = 0; j < n; j++) {
result[i] += A[i * n + j] * x[j];
for (int j = 0; j < n; j++) {
result[i] += A[i * n + j] * x[j];
}
}
}
}
// execute a single task on either host or device depending on size
#ifdef USE_PTHREADS
void *execute(void *inpArgs) {
threadData *dataPtr = (threadData *)inpArgs;
cudaStream_t *stream = dataPtr->streams;
cublasHandle_t *handle = dataPtr->handles;
int tid = dataPtr->tid;
void *execute(void *inpArgs)
{
threadData *dataPtr = (threadData *)inpArgs;
cudaStream_t *stream = dataPtr->streams;
cublasHandle_t *handle = dataPtr->handles;
int tid = dataPtr->tid;
for (int i = 0; i < dataPtr->taskSize; i++) {
Task<double> &t = dataPtr->TaskListPtr[i];
for (int i = 0; i < dataPtr->taskSize; i++) {
Task<double> &t = dataPtr->TaskListPtr[i];
if (t.size < 100) {
// perform on host
printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
t.size);
if (t.size < 100) {
// perform on host
printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
// attach managed memory to a (dummy) stream to allow host access while
// the device is running
checkCudaErrors(
cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
checkCudaErrors(
cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
checkCudaErrors(
cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished
checkCudaErrors(cudaStreamSynchronize(stream[0]));
// call the host operation
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
} else {
// perform on device
printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
t.size);
double one = 1.0;
double zero = 0.0;
// attach managed memory to a (dummy) stream to allow host access while
// the device is running
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished
checkCudaErrors(cudaStreamSynchronize(stream[0]));
// call the host operation
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
}
else {
// perform on device
printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
double one = 1.0;
double zero = 0.0;
// attach managed memory to my stream
checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
cudaMemAttachSingle));
// call the device operation
checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
&one, t.data, t.size, t.vector, 1, &zero,
t.result, 1));
// attach managed memory to my stream
checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
// call the device operation
checkCudaErrors(cublasDgemv(
handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
}
}
}
pthread_exit(NULL);
pthread_exit(NULL);
}
#else
template <typename T>
void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream,
int tid) {
if (t.size < 100) {
// perform on host
printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid,
t.size);
template <typename T> void execute(Task<T> &t, cublasHandle_t *handle, cudaStream_t *stream, int tid)
{
if (t.size < 100) {
// perform on host
printf("Task [%d], thread [%d] executing on host (%d)\n", t.id, tid, t.size);
// attach managed memory to a (dummy) stream to allow host access while the
// device is running
checkCudaErrors(
cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
checkCudaErrors(
cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
checkCudaErrors(
cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished
checkCudaErrors(cudaStreamSynchronize(stream[0]));
// call the host operation
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
} else {
// perform on device
printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid,
t.size);
double one = 1.0;
double zero = 0.0;
// attach managed memory to a (dummy) stream to allow host access while the
// device is running
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.data, 0, cudaMemAttachHost));
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.vector, 0, cudaMemAttachHost));
checkCudaErrors(cudaStreamAttachMemAsync(stream[0], t.result, 0, cudaMemAttachHost));
// necessary to ensure Async cudaStreamAttachMemAsync calls have finished
checkCudaErrors(cudaStreamSynchronize(stream[0]));
// call the host operation
gemv(t.size, t.size, 1.0, t.data, t.vector, 0.0, t.result);
}
else {
// perform on device
printf("Task [%d], thread [%d] executing on device (%d)\n", t.id, tid, t.size);
double one = 1.0;
double zero = 0.0;
// attach managed memory to my stream
checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0,
cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0,
cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0,
cudaMemAttachSingle));
// call the device operation
checkCudaErrors(cublasDgemv(handle[tid + 1], CUBLAS_OP_N, t.size, t.size,
&one, t.data, t.size, t.vector, 1, &zero,
t.result, 1));
}
// attach managed memory to my stream
checkCudaErrors(cublasSetStream(handle[tid + 1], stream[tid + 1]));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.data, 0, cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.vector, 0, cudaMemAttachSingle));
checkCudaErrors(cudaStreamAttachMemAsync(stream[tid + 1], t.result, 0, cudaMemAttachSingle));
// call the device operation
checkCudaErrors(cublasDgemv(
handle[tid + 1], CUBLAS_OP_N, t.size, t.size, &one, t.data, t.size, t.vector, 1, &zero, t.result, 1));
}
}
#endif
// populate a list of tasks with random sizes
template <typename T>
void initialise_tasks(std::vector<Task<T> > &TaskList) {
for (unsigned int i = 0; i < TaskList.size(); i++) {
// generate random size
int size;
size = std::max((int)(drand48() * 1000.0), 64);
TaskList[i].allocate(size, i);
}
template <typename T> void initialise_tasks(std::vector<Task<T>> &TaskList)
{
for (unsigned int i = 0; i < TaskList.size(); i++) {
// generate random size
int size;
size = std::max((int)(drand48() * 1000.0), 64);
TaskList[i].allocate(size, i);
}
}
int main(int argc, char **argv) {
// set device
cudaDeviceProp device_prop;
int dev_id = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
int main(int argc, char **argv)
{
// set device
cudaDeviceProp device_prop;
int dev_id = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
if (!device_prop.managedMemory) {
// This samples requires being run on a device that supports Unified Memory
fprintf(stderr, "Unified Memory not supported on this device\n");
if (!device_prop.managedMemory) {
// This samples requires being run on a device that supports Unified Memory
fprintf(stderr, "Unified Memory not supported on this device\n");
exit(EXIT_WAIVED);
}
exit(EXIT_WAIVED);
}
if (device_prop.computeMode == cudaComputeModeProhibited) {
// This sample requires being run with a default or process exclusive mode
fprintf(stderr,
"This sample requires a device in either default or process "
"exclusive mode\n");
if (device_prop.computeMode == cudaComputeModeProhibited) {
// This sample requires being run with a default or process exclusive mode
fprintf(stderr,
"This sample requires a device in either default or process "
"exclusive mode\n");
exit(EXIT_WAIVED);
}
exit(EXIT_WAIVED);
}
// randomise task sizes
int seed = (int)time(NULL);
srand48(seed);
// randomise task sizes
int seed = (int)time(NULL);
srand48(seed);
// set number of threads
const int nthreads = 4;
// set number of threads
const int nthreads = 4;
// number of streams = number of threads
cudaStream_t *streams = new cudaStream_t[nthreads + 1];
cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];
// number of streams = number of threads
cudaStream_t *streams = new cudaStream_t[nthreads + 1];
cublasHandle_t *handles = new cublasHandle_t[nthreads + 1];
for (int i = 0; i < nthreads + 1; i++) {
checkCudaErrors(cudaStreamCreate(&streams[i]));
checkCudaErrors(cublasCreate(&handles[i]));
}
for (int i = 0; i < nthreads + 1; i++) {
checkCudaErrors(cudaStreamCreate(&streams[i]));
checkCudaErrors(cublasCreate(&handles[i]));
}
// create list of N tasks
unsigned int N = 40;
std::vector<Task<double> > TaskList(N);
initialise_tasks(TaskList);
// create list of N tasks
unsigned int N = 40;
std::vector<Task<double>> TaskList(N);
initialise_tasks(TaskList);
printf("Executing tasks on host / device\n");
printf("Executing tasks on host / device\n");
// run through all tasks using threads and streams
#ifdef USE_PTHREADS
pthread_t threads[nthreads];
threadData *InputToThreads = new threadData[nthreads];
pthread_t threads[nthreads];
threadData *InputToThreads = new threadData[nthreads];
for (int i = 0; i < nthreads; i++) {
checkCudaErrors(cudaSetDevice(dev_id));
InputToThreads[i].tid = i;
InputToThreads[i].streams = streams;
InputToThreads[i].handles = handles;
for (int i = 0; i < nthreads; i++) {
checkCudaErrors(cudaSetDevice(dev_id));
InputToThreads[i].tid = i;
InputToThreads[i].streams = streams;
InputToThreads[i].handles = handles;
if ((TaskList.size() / nthreads) == 0) {
InputToThreads[i].taskSize = (TaskList.size() / nthreads);
InputToThreads[i].TaskListPtr =
&TaskList[i * (TaskList.size() / nthreads)];
} else {
if (i == nthreads - 1) {
InputToThreads[i].taskSize =
(TaskList.size() / nthreads) + (TaskList.size() % nthreads);
InputToThreads[i].TaskListPtr =
&TaskList[i * (TaskList.size() / nthreads) +
(TaskList.size() % nthreads)];
} else {
InputToThreads[i].taskSize = (TaskList.size() / nthreads);
InputToThreads[i].TaskListPtr =
&TaskList[i * (TaskList.size() / nthreads)];
}
if ((TaskList.size() / nthreads) == 0) {
InputToThreads[i].taskSize = (TaskList.size() / nthreads);
InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
}
else {
if (i == nthreads - 1) {
InputToThreads[i].taskSize = (TaskList.size() / nthreads) + (TaskList.size() % nthreads);
InputToThreads[i].TaskListPtr =
&TaskList[i * (TaskList.size() / nthreads) + (TaskList.size() % nthreads)];
}
else {
InputToThreads[i].taskSize = (TaskList.size() / nthreads);
InputToThreads[i].TaskListPtr = &TaskList[i * (TaskList.size() / nthreads)];
}
}
pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
}
for (int i = 0; i < nthreads; i++) {
pthread_join(threads[i], NULL);
}
pthread_create(&threads[i], NULL, &execute, &InputToThreads[i]);
}
for (int i = 0; i < nthreads; i++) {
pthread_join(threads[i], NULL);
}
#else
omp_set_num_threads(nthreads);
omp_set_num_threads(nthreads);
#pragma omp parallel for schedule(dynamic)
for (int i = 0; i < TaskList.size(); i++) {
checkCudaErrors(cudaSetDevice(dev_id));
int tid = omp_get_thread_num();
execute(TaskList[i], handles, streams, tid);
}
for (int i = 0; i < TaskList.size(); i++) {
checkCudaErrors(cudaSetDevice(dev_id));
int tid = omp_get_thread_num();
execute(TaskList[i], handles, streams, tid);
}
#endif
cudaDeviceSynchronize();
cudaDeviceSynchronize();
// Destroy CUDA Streams, cuBlas handles
for (int i = 0; i < nthreads + 1; i++) {
cudaStreamDestroy(streams[i]);
cublasDestroy(handles[i]);
}
// Destroy CUDA Streams, cuBlas handles
for (int i = 0; i < nthreads + 1; i++) {
cudaStreamDestroy(streams[i]);
cublasDestroy(handles[i]);
}
// Free TaskList
std::vector<Task<double> >().swap(TaskList);
// Free TaskList
std::vector<Task<double>>().swap(TaskList);
printf("All Done!\n");
exit(EXIT_SUCCESS);
printf("All Done!\n");
exit(EXIT_SUCCESS);
}


@@ -38,105 +38,107 @@
#include <stdio.h>
// includes CUDA Runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper utility functions
#include <helper_functions.h> // helper utility functions
__global__ void increment_kernel(int *g_data, int inc_value) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
g_data[idx] = g_data[idx] + inc_value;
__global__ void increment_kernel(int *g_data, int inc_value)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
g_data[idx] = g_data[idx] + inc_value;
}
bool correct_output(int *data, const int n, const int x) {
for (int i = 0; i < n; i++)
if (data[i] != x) {
printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
return false;
bool correct_output(int *data, const int n, const int x)
{
for (int i = 0; i < n; i++)
if (data[i] != x) {
printf("Error! data[%d] = %d, ref = %d\n", i, data[i], x);
return false;
}
return true;
}
int main(int argc, char *argv[])
{
int devID;
cudaDeviceProp deviceProps;
printf("[%s] - Starting...\n", argv[0]);
// This will pick the best possible CUDA capable device
devID = findCudaDevice(argc, (const char **)argv);
// get device name
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s]\n", deviceProps.name);
int n = 16 * 1024 * 1024;
int nbytes = n * sizeof(int);
int value = 26;
// allocate host memory
int *a = 0;
checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
memset(a, 0, nbytes);
// allocate device memory
int *d_a = 0;
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
checkCudaErrors(cudaMemset(d_a, 255, nbytes));
// set kernel launch configuration
dim3 threads = dim3(512, 1);
dim3 blocks = dim3(n / threads.x, 1);
// create cuda event handles
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkResetTimer(&timer);
checkCudaErrors(cudaDeviceSynchronize());
float gpu_time = 0.0f;
// asynchronously issue work to the GPU (all to stream 0)
checkCudaErrors(cudaProfilerStart());
sdkStartTimer(&timer);
cudaEventRecord(start, 0);
cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
cudaEventRecord(stop, 0);
sdkStopTimer(&timer);
checkCudaErrors(cudaProfilerStop());
// have CPU do some work while waiting for stage 1 to finish
unsigned long int counter = 0;
while (cudaEventQuery(stop) == cudaErrorNotReady) {
counter++;
}
return true;
}
int main(int argc, char *argv[]) {
int devID;
cudaDeviceProp deviceProps;
printf("[%s] - Starting...\n", argv[0]);
// This will pick the best possible CUDA capable device
devID = findCudaDevice(argc, (const char **)argv);
// get device name
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s]\n", deviceProps.name);
int n = 16 * 1024 * 1024;
int nbytes = n * sizeof(int);
int value = 26;
// allocate host memory
int *a = 0;
checkCudaErrors(cudaMallocHost((void **)&a, nbytes));
memset(a, 0, nbytes);
// allocate device memory
int *d_a = 0;
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
checkCudaErrors(cudaMemset(d_a, 255, nbytes));
// set kernel launch configuration
dim3 threads = dim3(512, 1);
dim3 blocks = dim3(n / threads.x, 1);
// create cuda event handles
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkResetTimer(&timer);
checkCudaErrors(cudaDeviceSynchronize());
float gpu_time = 0.0f;
// asynchronously issue work to the GPU (all to stream 0)
checkCudaErrors(cudaProfilerStart());
sdkStartTimer(&timer);
cudaEventRecord(start, 0);
cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
cudaEventRecord(stop, 0);
sdkStopTimer(&timer);
checkCudaErrors(cudaProfilerStop());
// have CPU do some work while waiting for stage 1 to finish
unsigned long int counter = 0;
while (cudaEventQuery(stop) == cudaErrorNotReady) {
counter++;
}
checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
// print the cpu and gpu times
printf("time spent executing by the GPU: %.2f\n", gpu_time);
printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
printf("CPU executed %lu iterations while waiting for GPU to finish\n",
counter);
// check the output for correctness
bool bFinalResults = correct_output(a, n, value);
// release resources
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaFreeHost(a));
checkCudaErrors(cudaFree(d_a));
exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
// print the cpu and gpu times
printf("time spent executing by the GPU: %.2f\n", gpu_time);
printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
printf("CPU executed %lu iterations while waiting for GPU to finish\n", counter);
// check the output for correctness
bool bFinalResults = correct_output(a, n, value);
// release resources
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaFreeHost(a));
checkCudaErrors(cudaFree(d_a));
exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
}


@@ -48,43 +48,46 @@
// This kernel computes a standard parallel reduction and evaluates the
// time it takes to do that for each block. The timing results are stored
// in device memory.
__global__ static void timedReduction(const float *input, float *output,
clock_t *timer) {
// __shared__ float shared[2 * blockDim.x];
extern __shared__ float shared[];
__global__ static void timedReduction(const float *input, float *output, clock_t *timer)
{
// __shared__ float shared[2 * blockDim.x];
extern __shared__ float shared[];
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int tid = threadIdx.x;
const int bid = blockIdx.x;
if (tid == 0) timer[bid] = clock();
if (tid == 0)
timer[bid] = clock();
// Copy input.
shared[tid] = input[tid];
shared[tid + blockDim.x] = input[tid + blockDim.x];
// Copy input.
shared[tid] = input[tid];
shared[tid + blockDim.x] = input[tid + blockDim.x];
// Perform reduction to find minimum.
for (int d = blockDim.x; d > 0; d /= 2) {
__syncthreads();
if (tid < d) {
float f0 = shared[tid];
float f1 = shared[tid + d];
if (f1 < f0) {
shared[tid] = f1;
}
}
}
// Write result.
if (tid == 0)
output[bid] = shared[0];
// Perform reduction to find minimum.
for (int d = blockDim.x; d > 0; d /= 2) {
__syncthreads();
if (tid < d) {
float f0 = shared[tid];
float f1 = shared[tid + d];
if (f1 < f0) {
shared[tid] = f1;
}
}
}
// Write result.
if (tid == 0) output[bid] = shared[0];
__syncthreads();
if (tid == 0) timer[bid + gridDim.x] = clock();
if (tid == 0)
timer[bid + gridDim.x] = clock();
}
#define NUM_BLOCKS 64
#define NUM_BLOCKS 64
#define NUM_THREADS 256
// It's interesting to change the number of blocks and the number of threads to
@@ -104,50 +104,46 @@ __global__ static void timedReduction(const float *input, float *output,
// the memory. With more than 32 the speed scales linearly.
// Start the main CUDA Sample here
int main(int argc, char **argv) {
printf("CUDA Clock sample\n");
int main(int argc, char **argv)
{
printf("CUDA Clock sample\n");
// This will pick the best possible CUDA capable device
int dev = findCudaDevice(argc, (const char **)argv);
// This will pick the best possible CUDA capable device
int dev = findCudaDevice(argc, (const char **)argv);
float *dinput = NULL;
float *doutput = NULL;
clock_t *dtimer = NULL;
float *dinput = NULL;
float *doutput = NULL;
clock_t *dtimer = NULL;
clock_t timer[NUM_BLOCKS * 2];
float input[NUM_THREADS * 2];
clock_t timer[NUM_BLOCKS * 2];
float input[NUM_THREADS * 2];
for (int i = 0; i < NUM_THREADS * 2; i++) {
input[i] = (float)i;
}
for (int i = 0; i < NUM_THREADS * 2; i++) {
input[i] = (float)i;
}
checkCudaErrors(
cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
checkCudaErrors(
cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2,
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(
dinput, doutput, dtimer);
timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaFree(dinput));
checkCudaErrors(cudaFree(doutput));
checkCudaErrors(cudaFree(dtimer));
checkCudaErrors(cudaFree(dinput));
checkCudaErrors(cudaFree(doutput));
checkCudaErrors(cudaFree(dtimer));
long double avgElapsedClocks = 0;
long double avgElapsedClocks = 0;
for (int i = 0; i < NUM_BLOCKS; i++) {
avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
}
for (int i = 0; i < NUM_BLOCKS; i++) {
avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
}
avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
printf("Average clocks/block = %Lf\n", avgElapsedClocks);
avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
printf("Average clocks/block = %Lf\n", avgElapsedClocks);
return EXIT_SUCCESS;
return EXIT_SUCCESS;
}


@@ -34,12 +34,11 @@
*/
// System includes
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <nvrtc_helper.h>
#include <stdint.h>
#include <stdio.h>
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
@@ -71,64 +71,68 @@
// Start the main CUDA Sample here
int main(int argc, char **argv) {
printf("CUDA Clock sample\n");
int main(int argc, char **argv)
{
printf("CUDA Clock sample\n");
typedef long clock_t;
typedef long clock_t;
clock_t timer[NUM_BLOCKS * 2];
clock_t timer[NUM_BLOCKS * 2];
float input[NUM_THREADS * 2];
float input[NUM_THREADS * 2];
for (int i = 0; i < NUM_THREADS * 2; i++) {
input[i] = (float)i;
}
for (int i = 0; i < NUM_THREADS * 2; i++) {
input[i] = (float)i;
}
char *cubin, *kernel_file;
size_t cubinSize;
char *cubin, *kernel_file;
size_t cubinSize;
kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
kernel_file = sdkFindFilePath("clock_kernel.cu", argv[0]);
compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
CUmodule module = loadCUBIN(cubin, argc, argv);
CUfunction kernel_addr;
CUmodule module = loadCUBIN(cubin, argc, argv);
CUfunction kernel_addr;
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "timedReduction"));
dim3 cudaBlockSize(NUM_THREADS, 1, 1);
dim3 cudaGridSize(NUM_BLOCKS, 1, 1);
dim3 cudaBlockSize(NUM_THREADS, 1, 1);
dim3 cudaGridSize(NUM_BLOCKS, 1, 1);
CUdeviceptr dinput, doutput, dtimer;
checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));
CUdeviceptr dinput, doutput, dtimer;
checkCudaErrors(cuMemAlloc(&dinput, sizeof(float) * NUM_THREADS * 2));
checkCudaErrors(cuMemAlloc(&doutput, sizeof(float) * NUM_BLOCKS));
checkCudaErrors(cuMemAlloc(&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
checkCudaErrors(cuMemcpyHtoD(dinput, input, sizeof(float) * NUM_THREADS * 2));
void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};
void *arr[] = {(void *)&dinput, (void *)&doutput, (void *)&dtimer};
checkCudaErrors(cuLaunchKernel(
kernel_addr, cudaGridSize.x, cudaGridSize.y,
cudaGridSize.z, /* grid dim */
cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
sizeof(float) * 2 * NUM_THREADS, 0, /* shared mem, stream */
&arr[0], /* arguments */
0));
checkCudaErrors(cuLaunchKernel(kernel_addr,
cudaGridSize.x,
cudaGridSize.y,
cudaGridSize.z, /* grid dim */
cudaBlockSize.x,
cudaBlockSize.y,
cudaBlockSize.z, /* block dim */
sizeof(float) * 2 * NUM_THREADS,
0, /* shared mem, stream */
&arr[0], /* arguments */
0));
checkCudaErrors(cuCtxSynchronize());
checkCudaErrors(
cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
checkCudaErrors(cuMemFree(dinput));
checkCudaErrors(cuMemFree(doutput));
checkCudaErrors(cuMemFree(dtimer));
checkCudaErrors(cuCtxSynchronize());
checkCudaErrors(cuMemcpyDtoH(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
checkCudaErrors(cuMemFree(dinput));
checkCudaErrors(cuMemFree(doutput));
checkCudaErrors(cuMemFree(dtimer));
long double avgElapsedClocks = 0;
long double avgElapsedClocks = 0;
for (int i = 0; i < NUM_BLOCKS; i++) {
avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
}
for (int i = 0; i < NUM_BLOCKS; i++) {
avgElapsedClocks += (long double)(timer[i + NUM_BLOCKS] - timer[i]);
}
avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
printf("Average clocks/block = %Lf\n", avgElapsedClocks);
avgElapsedClocks = avgElapsedClocks / NUM_BLOCKS;
printf("Average clocks/block = %Lf\n", avgElapsedClocks);
return EXIT_SUCCESS;
return EXIT_SUCCESS;
}


@@ -37,38 +37,41 @@
// time it takes to do that for each block. The timing results are stored
// in device memory.
extern "C" __global__ void timedReduction(const float *input, float *output,
clock_t *timer) {
// __shared__ float shared[2 * blockDim.x];
extern __shared__ float shared[];
extern "C" __global__ void timedReduction(const float *input, float *output, clock_t *timer)
{
// __shared__ float shared[2 * blockDim.x];
extern __shared__ float shared[];
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int tid = threadIdx.x;
const int bid = blockIdx.x;
if (tid == 0) timer[bid] = clock();
if (tid == 0)
timer[bid] = clock();
// Copy input.
shared[tid] = input[tid];
shared[tid + blockDim.x] = input[tid + blockDim.x];
// Copy input.
shared[tid] = input[tid];
shared[tid + blockDim.x] = input[tid + blockDim.x];
// Perform reduction to find minimum.
for (int d = blockDim.x; d > 0; d /= 2) {
__syncthreads();
if (tid < d) {
float f0 = shared[tid];
float f1 = shared[tid + d];
if (f1 < f0) {
shared[tid] = f1;
}
}
}
// Write result.
if (tid == 0)
output[bid] = shared[0];
// Perform reduction to find minimum.
for (int d = blockDim.x; d > 0; d /= 2) {
__syncthreads();
if (tid < d) {
float f0 = shared[tid];
float f1 = shared[tid + d];
if (f1 < f0) {
shared[tid] = f1;
}
}
}
// Write result.
if (tid == 0) output[bid] = shared[0];
__syncthreads();
if (tid == 0) timer[bid + gridDim.x] = clock();
if (tid == 0)
timer[bid + gridDim.x] = clock();
}


@@ -32,128 +32,125 @@
#include <helper_cuda.h>
#include <omp.h>
#include <stdio.h> // stdio functions are used since C++ streams aren't necessarily thread safe
#include <stdio.h> // stdio functions are used since C++ streams aren't necessarily thread safe
using namespace std;
// a simple kernel that simply increments each array element by b
__global__ void kernelAddConstant(int *g_a, const int b) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
g_a[idx] += b;
__global__ void kernelAddConstant(int *g_a, const int b)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
g_a[idx] += b;
}
// a predicate that checks whether each array element is set to its index plus b
int correctResult(int *data, const int n, const int b) {
for (int i = 0; i < n; i++)
if (data[i] != i + b) return 0;
int correctResult(int *data, const int n, const int b)
{
for (int i = 0; i < n; i++)
if (data[i] != i + b)
return 0;
return 1;
return 1;
}
int main(int argc, char *argv[]) {
int num_gpus = 0; // number of CUDA GPUs
int main(int argc, char *argv[])
{
int num_gpus = 0; // number of CUDA GPUs
printf("%s Starting...\n\n", argv[0]);
printf("%s Starting...\n\n", argv[0]);
/////////////////////////////////////////////////////////////////
// determine the number of CUDA capable GPUs
//
cudaGetDeviceCount(&num_gpus);
/////////////////////////////////////////////////////////////////
// determine the number of CUDA capable GPUs
//
cudaGetDeviceCount(&num_gpus);
if (num_gpus < 1) {
printf("no CUDA capable devices were detected\n");
return 1;
}
if (num_gpus < 1) {
printf("no CUDA capable devices were detected\n");
return 1;
}
/////////////////////////////////////////////////////////////////
// display CPU and GPU configuration
//
printf("number of host CPUs:\t%d\n", omp_get_num_procs());
printf("number of CUDA devices:\t%d\n", num_gpus);
/////////////////////////////////////////////////////////////////
// display CPU and GPU configuration
//
printf("number of host CPUs:\t%d\n", omp_get_num_procs());
printf("number of CUDA devices:\t%d\n", num_gpus);
for (int i = 0; i < num_gpus; i++) {
cudaDeviceProp dprop;
cudaGetDeviceProperties(&dprop, i);
printf(" %d: %s\n", i, dprop.name);
}
for (int i = 0; i < num_gpus; i++) {
cudaDeviceProp dprop;
cudaGetDeviceProperties(&dprop, i);
printf(" %d: %s\n", i, dprop.name);
}
printf("---------------------------\n");
printf("---------------------------\n");
/////////////////////////////////////////////////////////////////
// initialize data
//
unsigned int n = num_gpus * 8192;
unsigned int nbytes = n * sizeof(int);
int *a = 0; // pointer to data on the CPU
int b = 3; // value by which the array is incremented
a = (int *)malloc(nbytes);
/////////////////////////////////////////////////////////////////
// initialize data
//
unsigned int n = num_gpus * 8192;
unsigned int nbytes = n * sizeof(int);
int *a = 0; // pointer to data on the CPU
int b = 3; // value by which the array is incremented
a = (int *)malloc(nbytes);
if (0 == a) {
printf("couldn't allocate CPU memory\n");
return 1;
}
if (0 == a) {
printf("couldn't allocate CPU memory\n");
return 1;
}
for (unsigned int i = 0; i < n; i++) a[i] = i;
for (unsigned int i = 0; i < n; i++)
a[i] = i;
////////////////////////////////////////////////////////////////
// run as many CPU threads as there are CUDA devices
// each CPU thread controls a different device, processing its
// portion of the data. It's possible to use more CPU threads
// than there are CUDA devices, in which case several CPU
// threads will be allocating resources and launching kernels
// on the same device. For example, try omp_set_num_threads(2*num_gpus);
// Recall that all variables declared inside an "omp parallel" scope are
// local to each CPU thread
//
omp_set_num_threads(
num_gpus); // create as many CPU threads as there are CUDA devices
////////////////////////////////////////////////////////////////
// run as many CPU threads as there are CUDA devices
// each CPU thread controls a different device, processing its
// portion of the data. It's possible to use more CPU threads
// than there are CUDA devices, in which case several CPU
// threads will be allocating resources and launching kernels
// on the same device. For example, try omp_set_num_threads(2*num_gpus);
// Recall that all variables declared inside an "omp parallel" scope are
// local to each CPU thread
//
omp_set_num_threads(num_gpus); // create as many CPU threads as there are CUDA devices
// omp_set_num_threads(2*num_gpus);// create twice as many CPU threads as there
// are CUDA devices
#pragma omp parallel
{
unsigned int cpu_thread_id = omp_get_thread_num();
unsigned int num_cpu_threads = omp_get_num_threads();
{
unsigned int cpu_thread_id = omp_get_thread_num();
unsigned int num_cpu_threads = omp_get_num_threads();
// set and check the CUDA device for this CPU thread
int gpu_id = -1;
checkCudaErrors(cudaSetDevice(
cpu_thread_id %
num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
checkCudaErrors(cudaGetDevice(&gpu_id));
printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id,
num_cpu_threads, gpu_id);
// set and check the CUDA device for this CPU thread
int gpu_id = -1;
checkCudaErrors(
cudaSetDevice(cpu_thread_id % num_gpus)); // "% num_gpus" allows more CPU threads than GPU devices
checkCudaErrors(cudaGetDevice(&gpu_id));
printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id, num_cpu_threads, gpu_id);
int *d_a =
0; // pointer to memory on the device associated with this CPU thread
int *sub_a =
a +
cpu_thread_id * n /
num_cpu_threads; // pointer to this CPU thread's portion of data
unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
dim3 gpu_threads(128); // 128 threads per block
dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));
int *d_a = 0; // pointer to memory on the device associated with this CPU thread
int *sub_a = a + cpu_thread_id * n / num_cpu_threads; // pointer to this CPU thread's portion of data
unsigned int nbytes_per_kernel = nbytes / num_cpu_threads;
dim3 gpu_threads(128); // 128 threads per block
dim3 gpu_blocks(n / (gpu_threads.x * num_cpu_threads));
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
checkCudaErrors(
cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
checkCudaErrors(cudaMalloc((void **)&d_a, nbytes_per_kernel));
checkCudaErrors(cudaMemset(d_a, 0, nbytes_per_kernel));
checkCudaErrors(cudaMemcpy(d_a, sub_a, nbytes_per_kernel, cudaMemcpyHostToDevice));
kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
checkCudaErrors(
cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaFree(d_a));
}
printf("---------------------------\n");
checkCudaErrors(cudaMemcpy(sub_a, d_a, nbytes_per_kernel, cudaMemcpyDeviceToHost));
checkCudaErrors(cudaFree(d_a));
}
printf("---------------------------\n");
if (cudaSuccess != cudaGetLastError())
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
if (cudaSuccess != cudaGetLastError())
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
////////////////////////////////////////////////////////////////
// check the result
//
bool bResult = correctResult(a, n, b);
////////////////////////////////////////////////////////////////
// check the result
//
bool bResult = correctResult(a, n, b);
if (a) free(a); // free CPU memory
if (a)
free(a); // free CPU memory
exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}


@@ -25,191 +25,188 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "cuda_fp16.h"
#include "helper_cuda.h"
#include <cstdio>
#include <cstdlib>
#include <ctime>
#define NUM_OF_BLOCKS 128
#include "cuda_fp16.h"
#include "helper_cuda.h"
#define NUM_OF_BLOCKS 128
#define NUM_OF_THREADS 128
__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v) {
if (threadIdx.x < 64)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
__syncthreads();
if (threadIdx.x < 32)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 32]);
__syncthreads();
if (threadIdx.x < 16)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 16]);
__syncthreads();
if (threadIdx.x < 8)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 8]);
__syncthreads();
if (threadIdx.x < 4)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 4]);
__syncthreads();
if (threadIdx.x < 2)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 2]);
__syncthreads();
if (threadIdx.x < 1)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 1]);
__syncthreads();
__forceinline__ __device__ void reduceInShared_intrinsics(half2 *const v)
{
if (threadIdx.x < 64)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 64]);
__syncthreads();
if (threadIdx.x < 32)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 32]);
__syncthreads();
if (threadIdx.x < 16)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 16]);
__syncthreads();
if (threadIdx.x < 8)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 8]);
__syncthreads();
if (threadIdx.x < 4)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 4]);
__syncthreads();
if (threadIdx.x < 2)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 2]);
__syncthreads();
if (threadIdx.x < 1)
v[threadIdx.x] = __hadd2(v[threadIdx.x], v[threadIdx.x + 1]);
__syncthreads();
}
__forceinline__ __device__ void reduceInShared_native(half2 *const v) {
if (threadIdx.x < 64) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
__syncthreads();
if (threadIdx.x < 32) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
__syncthreads();
if (threadIdx.x < 16) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
__syncthreads();
if (threadIdx.x < 8) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
__syncthreads();
if (threadIdx.x < 4) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
__syncthreads();
if (threadIdx.x < 2) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
__syncthreads();
if (threadIdx.x < 1) v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
__syncthreads();
__forceinline__ __device__ void reduceInShared_native(half2 *const v)
{
if (threadIdx.x < 64)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 64];
__syncthreads();
if (threadIdx.x < 32)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 32];
__syncthreads();
if (threadIdx.x < 16)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 16];
__syncthreads();
if (threadIdx.x < 8)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 8];
__syncthreads();
if (threadIdx.x < 4)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 4];
__syncthreads();
if (threadIdx.x < 2)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 2];
__syncthreads();
if (threadIdx.x < 1)
v[threadIdx.x] = v[threadIdx.x] + v[threadIdx.x + 1];
__syncthreads();
}
__global__ void scalarProductKernel_intrinsics(half2 const *const a,
half2 const *const b,
float *const results,
size_t const size) {
const int stride = gridDim.x * blockDim.x;
__shared__ half2 shArray[NUM_OF_THREADS];
__global__ void
scalarProductKernel_intrinsics(half2 const *const a, half2 const *const b, float *const results, size_t const size)
{
const int stride = gridDim.x * blockDim.x;
__shared__ half2 shArray[NUM_OF_THREADS];
shArray[threadIdx.x] = __float2half2_rn(0.f);
half2 value = __float2half2_rn(0.f);
shArray[threadIdx.x] = __float2half2_rn(0.f);
half2 value = __float2half2_rn(0.f);
for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
value = __hfma2(a[i], b[i], value);
}
for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
value = __hfma2(a[i], b[i], value);
}
shArray[threadIdx.x] = value;
__syncthreads();
reduceInShared_intrinsics(shArray);
shArray[threadIdx.x] = value;
__syncthreads();
reduceInShared_intrinsics(shArray);
if (threadIdx.x == 0) {
half2 result = shArray[0];
float f_result = __low2float(result) + __high2float(result);
results[blockIdx.x] = f_result;
}
if (threadIdx.x == 0) {
half2 result = shArray[0];
float f_result = __low2float(result) + __high2float(result);
results[blockIdx.x] = f_result;
}
}
__global__ void scalarProductKernel_native(half2 const *const a,
half2 const *const b,
float *const results,
size_t const size) {
const int stride = gridDim.x * blockDim.x;
__shared__ half2 shArray[NUM_OF_THREADS];
__global__ void
scalarProductKernel_native(half2 const *const a, half2 const *const b, float *const results, size_t const size)
{
const int stride = gridDim.x * blockDim.x;
__shared__ half2 shArray[NUM_OF_THREADS];
half2 value(0.f, 0.f);
shArray[threadIdx.x] = value;
half2 value(0.f, 0.f);
shArray[threadIdx.x] = value;
for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
value = a[i] * b[i] + value;
}
for (int i = threadIdx.x + blockDim.x + blockIdx.x; i < size; i += stride) {
value = a[i] * b[i] + value;
}
shArray[threadIdx.x] = value;
__syncthreads();
reduceInShared_native(shArray);
shArray[threadIdx.x] = value;
__syncthreads();
reduceInShared_native(shArray);
if (threadIdx.x == 0) {
half2 result = shArray[0];
float f_result = (float)result.y + (float)result.x;
results[blockIdx.x] = f_result;
}
if (threadIdx.x == 0) {
half2 result = shArray[0];
float f_result = (float)result.y + (float)result.x;
results[blockIdx.x] = f_result;
}
}
void generateInput(half2 *a, size_t size) {
for (size_t i = 0; i < size; ++i) {
half2 temp;
temp.x = static_cast<float>(rand() % 4);
temp.y = static_cast<float>(rand() % 2);
a[i] = temp;
}
void generateInput(half2 *a, size_t size)
{
for (size_t i = 0; i < size; ++i) {
half2 temp;
temp.x = static_cast<float>(rand() % 4);
temp.y = static_cast<float>(rand() % 2);
a[i] = temp;
}
}
int main(int argc, char *argv[]) {
srand((unsigned int)time(NULL));
size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;
int main(int argc, char *argv[])
{
srand((unsigned int)time(NULL));
size_t size = NUM_OF_BLOCKS * NUM_OF_THREADS * 16;
half2 *vec[2];
half2 *devVec[2];
half2 *vec[2];
half2 *devVec[2];
float *results;
float *devResults;
float *results;
float *devResults;
int devID = findCudaDevice(argc, (const char **)argv);
int devID = findCudaDevice(argc, (const char **)argv);
cudaDeviceProp devProp;
checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));
cudaDeviceProp devProp;
checkCudaErrors(cudaGetDeviceProperties(&devProp, devID));
if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
printf(
"ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
"higher.\n");
return EXIT_WAIVED;
}
if (devProp.major < 5 || (devProp.major == 5 && devProp.minor < 3)) {
printf("ERROR: fp16ScalarProduct requires GPU devices with compute SM 5.3 or "
"higher.\n");
return EXIT_WAIVED;
}
for (int i = 0; i < 2; ++i) {
checkCudaErrors(cudaMallocHost((void **)&vec[i], size * sizeof *vec[i]));
checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
}
for (int i = 0; i < 2; ++i) {
checkCudaErrors(cudaMallocHost((void **)&vec[i], size * sizeof *vec[i]));
checkCudaErrors(cudaMalloc((void **)&devVec[i], size * sizeof *devVec[i]));
}
checkCudaErrors(
cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
checkCudaErrors(
cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));
checkCudaErrors(cudaMallocHost((void **)&results, NUM_OF_BLOCKS * sizeof *results));
checkCudaErrors(cudaMalloc((void **)&devResults, NUM_OF_BLOCKS * sizeof *devResults));
for (int i = 0; i < 2; ++i) {
generateInput(vec[i], size);
checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i],
cudaMemcpyHostToDevice));
}
for (int i = 0; i < 2; ++i) {
generateInput(vec[i], size);
checkCudaErrors(cudaMemcpy(devVec[i], vec[i], size * sizeof *vec[i], cudaMemcpyHostToDevice));
}
scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
devVec[0], devVec[1], devResults, size);
scalarProductKernel_native<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);
checkCudaErrors(cudaMemcpy(results, devResults,
NUM_OF_BLOCKS * sizeof *results,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));
float result_native = 0;
for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
result_native += results[i];
}
printf("Result native operators\t: %f \n", result_native);
float result_native = 0;
for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
result_native += results[i];
}
printf("Result native operators\t: %f \n", result_native);
scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(
devVec[0], devVec[1], devResults, size);
scalarProductKernel_intrinsics<<<NUM_OF_BLOCKS, NUM_OF_THREADS>>>(devVec[0], devVec[1], devResults, size);
checkCudaErrors(cudaMemcpy(results, devResults,
NUM_OF_BLOCKS * sizeof *results,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(results, devResults, NUM_OF_BLOCKS * sizeof *results, cudaMemcpyDeviceToHost));
float result_intrinsics = 0;
for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
result_intrinsics += results[i];
}
printf("Result intrinsics\t: %f \n", result_intrinsics);
float result_intrinsics = 0;
for (int i = 0; i < NUM_OF_BLOCKS; ++i) {
result_intrinsics += results[i];
}
printf("Result intrinsics\t: %f \n", result_intrinsics);
printf("&&&& fp16ScalarProduct %s\n",
(fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED"
: "FAILED");
printf("&&&& fp16ScalarProduct %s\n", (fabs(result_intrinsics - result_native) < 0.00001) ? "PASSED" : "FAILED");
for (int i = 0; i < 2; ++i) {
checkCudaErrors(cudaFree(devVec[i]));
checkCudaErrors(cudaFreeHost(vec[i]));
}
for (int i = 0; i < 2; ++i) {
checkCudaErrors(cudaFree(devVec[i]));
checkCudaErrors(cudaFreeHost(vec[i]));
}
checkCudaErrors(cudaFree(devResults));
checkCudaErrors(cudaFreeHost(results));
checkCudaErrors(cudaFree(devResults));
checkCudaErrors(cudaFreeHost(results));
return EXIT_SUCCESS;
return EXIT_SUCCESS;
}


@@ -40,314 +40,303 @@
*/
// System includes
#include <stdio.h>
#include <assert.h>
#include <stdio.h>
// CUDA runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#include <cuda_runtime.h>
// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#include <helper_functions.h>
/**
* Matrix multiplication (CUDA Kernel) on the device: C = A * B
* wA is A's width and wB is B's width
*/
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A,
float *B, int wA,
int wB) {
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block
int aBegin = wA * BLOCK_SIZE * by;
// Index of the first sub-matrix of A processed by the block
int aBegin = wA * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + wA - 1;
// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;
// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;
// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * wB;
// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * wB;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int a = aBegin, b = bBegin;
a <= aEnd;
a += aStep, b += bStep) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx];
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
}
void ConstantInit(float *data, int size, float val) {
for (int i = 0; i < size; ++i) {
data[i] = val;
}
void ConstantInit(float *data, int size, float val)
{
for (int i = 0; i < size; ++i) {
data[i] = val;
}
}
/**
* Run a simple test of matrix multiplication using CUDA
*/
int MatrixMultiply(int argc, char **argv,
int block_size, const dim3 &dimsA,
const dim3 &dimsB) {
// Allocate host memory for matrices A and B
unsigned int size_A = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A;
checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
unsigned int size_B = dimsB.x * dimsB.y;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B;
checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
cudaStream_t stream;
int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB)
{
// Allocate host memory for matrices A and B
unsigned int size_A = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A;
checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
unsigned int size_B = dimsB.x * dimsB.y;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B;
checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
cudaStream_t stream;
// Initialize host memory
const float valB = 0.01f;
ConstantInit(h_A, size_A, 1.0f);
ConstantInit(h_B, size_B, valB);
// Initialize host memory
const float valB = 0.01f;
ConstantInit(h_A, size_A, 1.0f);
ConstantInit(h_B, size_B, valB);
// Allocate device memory
float *d_A, *d_B, *d_C;
// Allocate device memory
float *d_A, *d_B, *d_C;
// Allocate host matrix C
dim3 dimsC(dimsB.x, dimsA.y, 1);
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
float *h_C;
checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));
// Allocate host matrix C
dim3 dimsC(dimsB.x, dimsA.y, 1);
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
float *h_C;
checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));
if (h_C == NULL) {
fprintf(stderr, "Failed to allocate host matrix C!\n");
exit(EXIT_FAILURE);
}
if (h_C == NULL) {
fprintf(stderr, "Failed to allocate host matrix C!\n");
exit(EXIT_FAILURE);
}
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
// Allocate CUDA events that we'll use for timing
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
// Allocate CUDA events that we'll use for timing
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
// copy host memory to device
checkCudaErrors(
cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
checkCudaErrors(
cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
// copy host memory to device
checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
// Setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
// Setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
// Create and start timer
printf("Computing result using CUDA Kernel...\n");
// Create and start timer
printf("Computing result using CUDA Kernel...\n");
  // Performs warmup operation using matrixMul CUDA kernel
  if (block_size == 16) {
    MatrixMulCUDA<16>
        <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
  } else {
    MatrixMulCUDA<32>
        <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
  }
  printf("done\n");
  checkCudaErrors(cudaStreamSynchronize(stream));
  // Record the start event
  checkCudaErrors(cudaEventRecord(start, stream));
  // Execute the kernel
  int nIter = 300;
  for (int j = 0; j < nIter; j++) {
    if (block_size == 16) {
      MatrixMulCUDA<16>
          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    } else {
      MatrixMulCUDA<32>
          <<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
  }
  // Record the stop event
  checkCudaErrors(cudaEventRecord(stop, stream));
  // Wait for the stop event to complete
  checkCudaErrors(cudaEventSynchronize(stop));
  float msecTotal = 0.0f;
  checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
  // Compute and print the performance
  float msecPerMatrixMul = msecTotal / nIter;
  double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x) *
                             static_cast<double>(dimsA.y) *
                             static_cast<double>(dimsB.x);
  double gigaFlops =
      (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
  printf(
      "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
      " WorkgroupSize= %u threads/block\n",
      gigaFlops, msecPerMatrixMul, flopsPerMatrixMul, threads.x * threads.y);
  // Copy result from device to host
  checkCudaErrors(
      cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
  checkCudaErrors(cudaStreamSynchronize(stream));
  printf("Checking computed result for correctness: ");
  bool correct = true;
  // test relative error by the formula
  //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
  double eps = 1.e-6;  // machine zero
  for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
    double abs_err = fabs(h_C[i] - (dimsA.x * valB));
    double dot_length = dimsA.x;
    double abs_val = fabs(h_C[i]);
    double rel_err = abs_err / abs_val / dot_length;
    if (rel_err > eps) {
      printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n",
             i, h_C[i], dimsA.x * valB, eps);
      correct = false;
    }
  }
  printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
  // Clean up memory
  checkCudaErrors(cudaFreeHost(h_A));
  checkCudaErrors(cudaFreeHost(h_B));
  checkCudaErrors(cudaFreeHost(h_C));
  checkCudaErrors(cudaFree(d_A));
  checkCudaErrors(cudaFree(d_B));
  checkCudaErrors(cudaFree(d_C));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));
  printf(
      "\nNOTE: The CUDA Samples are not meant for performance "
      "measurements. Results may vary when GPU Boost is enabled.\n");
  if (correct) {
    return EXIT_SUCCESS;
  } else {
    return EXIT_FAILURE;
  }
    // Performs warmup operation using matrixMul CUDA kernel
    if (block_size == 16) {
        MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    else {
        MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    printf("done\n");
    checkCudaErrors(cudaStreamSynchronize(stream));
    // Record the start event
    checkCudaErrors(cudaEventRecord(start, stream));
    // Execute the kernel
    int nIter = 300;
    for (int j = 0; j < nIter; j++) {
        if (block_size == 16) {
            MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
        else {
            MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    }
// Record the stop event
checkCudaErrors(cudaEventRecord(stop, stream));
// Wait for the stop event to complete
checkCudaErrors(cudaEventSynchronize(stop));
float msecTotal = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
// Compute and print the performance
float msecPerMatrixMul = msecTotal / nIter;
double flopsPerMatrixMul =
2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
" WorkgroupSize= %u threads/block\n",
gigaFlops,
msecPerMatrixMul,
flopsPerMatrixMul,
threads.x * threads.y);
// Copy result from device to host
checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaStreamSynchronize(stream));
printf("Checking computed result for correctness: ");
bool correct = true;
// test relative error by the formula
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
double eps = 1.e-6; // machine zero
for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
double abs_err = fabs(h_C[i] - (dimsA.x * valB));
double dot_length = dimsA.x;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
// Clean up memory
checkCudaErrors(cudaFreeHost(h_A));
checkCudaErrors(cudaFreeHost(h_B));
checkCudaErrors(cudaFreeHost(h_C));
checkCudaErrors(cudaFree(d_A));
checkCudaErrors(cudaFree(d_B));
checkCudaErrors(cudaFree(d_C));
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
printf("\nNOTE: The CUDA Samples are not meant for performance "
"measurements. Results may vary when GPU Boost is enabled.\n");
if (correct) {
return EXIT_SUCCESS;
}
else {
return EXIT_FAILURE;
}
}
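The correctness check above relies on the constant fills: with A set to 1.0f and B to valB = 0.01f, every element of C is analytically dimsA.x * valB, so an element passes when |h_C[i] - wA * valB| / (|h_C[i]| * wA) < eps. A minimal host-side sketch of the same test, for illustration only (verifyConstantFill is a hypothetical helper, not part of the sample; requires <cmath> for fabs):

// Sketch only; assumes A was filled with 1.0f and B with valB, as in MatrixMultiply above.
static bool verifyConstantFill(const float *C, int n, int wA, float valB, double eps = 1.e-6)
{
    const double ref = static_cast<double>(wA) * valB; // analytic value of every element of C
    for (int i = 0; i < n; i++) {
        double rel_err = fabs(C[i] - ref) / (fabs(C[i]) * wA);
        if (rel_err > eps)
            return false;
    }
    return true;
}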
/**
* Program main
*/
int main(int argc, char **argv) {
printf("[Matrix Multiply Using CUDA] - Starting...\n");
int main(int argc, char **argv)
{
printf("[Matrix Multiply Using CUDA] - Starting...\n");
if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
checkCmdLineFlag(argc, (const char **)argv, "?")) {
printf("Usage -device=n (n >= 0 for deviceID)\n");
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
printf(" Note: Outer matrix dimensions of A & B matrices" \
" must be equal.\n");
if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
printf("Usage -device=n (n >= 0 for deviceID)\n");
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
printf(" Note: Outer matrix dimensions of A & B matrices"
" must be equal.\n");
exit(EXIT_SUCCESS);
}
exit(EXIT_SUCCESS);
}
// This will pick the best possible CUDA capable device, otherwise
// override the device ID based on input provided at the command line
int dev = findCudaDevice(argc, (const char **)argv);
// This will pick the best possible CUDA capable device, otherwise
// override the device ID based on input provided at the command line
int dev = findCudaDevice(argc, (const char **)argv);
int block_size = 32;
int block_size = 32;
dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
// width of Matrix A
if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
}
// width of Matrix A
if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
}
// height of Matrix A
if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
}
// height of Matrix A
if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
}
// width of Matrix B
if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
}
// width of Matrix B
if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
}
// height of Matrix B
if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
}
// height of Matrix B
if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
}
if (dimsA.x != dimsB.y) {
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
dimsA.x, dimsB.y);
exit(EXIT_FAILURE);
}
if (dimsA.x != dimsB.y) {
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
exit(EXIT_FAILURE);
}
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y,
dimsB.x, dimsB.y);
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
checkCudaErrors(cudaProfilerStart());
int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
checkCudaErrors(cudaProfilerStop());
checkCudaErrors(cudaProfilerStart());
int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
checkCudaErrors(cudaProfilerStop());
exit(matrix_result);
exit(matrix_result);
}
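For reference, a typical invocation with explicit sizes might look like ./matrixMul -wA=320 -hA=320 -wB=640 -hB=320 (the sizes are illustrative). wA must equal hB, which is what the dimsA.x != dimsB.y check enforces, and keeping each dimension a multiple of block_size (32) keeps the grid computation dimsB.x / threads.x and dimsA.y / threads.y exact.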


@ -30,11 +30,11 @@
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA // Matrix B height
#define WC WB // Matrix C width
#define HC HA // Matrix C height
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA // Matrix B height
#define WC WB // Matrix C width
#define HC HA // Matrix C height
#endif // _MATRIXMUL_H_
#endif // _MATRIXMUL_H_
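As a worked example, with block_size = 32 these macros give WA = 128, HA = 192, WB = 128 and HB = WA = 128, so A is 192 x 128, B is 128 x 128, and C is HC x WC = 192 x 128; defining HB as WA keeps the inner dimensions equal by construction.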


@ -46,23 +46,23 @@
// includes, system
#include <builtin_types.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes, project, CUDA
#include <cstring>
#include <cuda.h>
#include <helper_cuda_drvapi.h>
#include <helper_image.h>
#include <helper_string.h>
#include <helper_timer.h>
#include <cstring>
#include <iostream>
#include <string>
#include "matrixMul.h"
@ -71,11 +71,9 @@
void runTest(int argc, char **argv);
void randomInit(float *, int);
extern "C" void computeGold(float *, const float *, const float *, unsigned int,
unsigned int, unsigned int);
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
int *blk_size);
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size);
#ifndef FATBIN_FILE
#define FATBIN_FILE "matrixMul_kernel64.fatbin"
@ -84,237 +82,252 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
////////////////////////////////////////////////////////////////////////////////
// Globals
////////////////////////////////////////////////////////////////////////////////
CUdevice cuDevice;
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
size_t totalGlobalMem;
CUmodule cuModule;
size_t totalGlobalMem;
const char *sSDKsample = "matrixMulDrv (Driver API)";
void constantInit(float *data, int size, float val) {
for (int i = 0; i < size; ++i) {
data[i] = val;
}
void constantInit(float *data, int size, float val)
{
for (int i = 0; i < size; ++i) {
data[i] = val;
}
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
printf("[ %s ]\n", sSDKsample);
int main(int argc, char **argv)
{
printf("[ %s ]\n", sSDKsample);
runTest(argc, argv);
runTest(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) {
// initialize CUDA
CUfunction matrixMul = NULL;
int block_size = 0;
void runTest(int argc, char **argv)
{
// initialize CUDA
CUfunction matrixMul = NULL;
int block_size = 0;
initCUDA(argc, argv, &matrixMul, &block_size);
initCUDA(argc, argv, &matrixMul, &block_size);
// set seed for rand()
srand(2006);
// set seed for rand()
srand(2006);
// allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = reinterpret_cast<float *>(malloc(mem_size_B));
// allocate host memory for matrices A and B
unsigned int size_A = WA * HA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
unsigned int size_B = WB * HB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = reinterpret_cast<float *>(malloc(mem_size_B));
// initialize host memory
const float valB = 0.01f;
constantInit(h_A, size_A, 1.0f);
constantInit(h_B, size_B, valB);
// initialize host memory
const float valB = 0.01f;
constantInit(h_A, size_A, 1.0f);
constantInit(h_B, size_B, valB);
// allocate device memory
CUdeviceptr d_A;
checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
CUdeviceptr d_B;
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
// allocate device memory
CUdeviceptr d_A;
checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
CUdeviceptr d_B;
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
// copy host memory to device
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// copy host memory to device
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// allocate device memory for result
size_t size_C = WC * HC;
size_t mem_size_C = sizeof(float) * size_C;
// allocate device memory for result
size_t size_C = WC * HC;
size_t mem_size_C = sizeof(float) * size_C;
CUdeviceptr d_C;
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
CUdeviceptr d_C;
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
// allocate mem for the result on host side
float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));
// allocate mem for the result on host side
float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));
// create and start timer
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
// create and start timer
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
// start the timer
sdkStartTimer(&timer);
// start the timer
sdkStartTimer(&timer);
// There are two ways to launch CUDA kernels via the Driver API.
// In this CUDA Sample, we illustrate both ways to pass parameters
// and specify parameters. By default we use the simpler method.
dim3 block(block_size, block_size, 1);
dim3 grid(WC / block_size, HC / block_size, 1);
// There are two ways to launch CUDA kernels via the Driver API.
// In this CUDA Sample, we illustrate both ways to pass parameters
// and specify parameters. By default we use the simpler method.
dim3 block(block_size, block_size, 1);
dim3 grid(WC / block_size, HC / block_size, 1);
if (1) {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
    // Launching (simpler method)
size_t Matrix_Width_A = (size_t)WA;
size_t Matrix_Width_B = (size_t)WB;
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
// new CUDA 4.0 Driver API Kernel launch call
checkCudaErrors(cuLaunchKernel(
matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
2 * block_size * block_size * sizeof(float), NULL, args, NULL));
} else {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
// Launching (advanced method)
int offset = 0;
char argBuffer[256];
// pass in launch parameters (not actually de-referencing CUdeviceptr).
// CUdeviceptr is storing the value of the parameters
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
offset += sizeof(d_C);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
offset += sizeof(d_A);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
offset += sizeof(d_B);
size_t Matrix_Width_A = (size_t)WA;
size_t Matrix_Width_B = (size_t)WB;
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
offset += sizeof(Matrix_Width_A);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
offset += sizeof(Matrix_Width_B);
void *kernel_launch_config[5] = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
CU_LAUNCH_PARAM_BUFFER_SIZE, &offset,
CU_LAUNCH_PARAM_END};
// new CUDA 4.0 Driver API Kernel launch call
checkCudaErrors(cuLaunchKernel(
matrixMul, grid.x, grid.y, grid.z, block.x, block.y, block.z,
2 * block_size * block_size * sizeof(float), NULL, NULL,
reinterpret_cast<void **>(&kernel_launch_config)));
}
// copy result from device to host
checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));
// stop and destroy timer
sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
sdkDeleteTimer(&timer);
printf("Checking computed result for correctness: ");
bool correct = true;
for (int i = 0; i < static_cast<int>(WC * HC); i++) {
if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i,
h_C[i], WA * valB);
correct = false;
if (1) {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (simpler method)
size_t Matrix_Width_A = (size_t)WA;
size_t Matrix_Width_B = (size_t)WB;
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
// new CUDA 4.0 Driver API Kernel launch call
checkCudaErrors(cuLaunchKernel(matrixMul,
grid.x,
grid.y,
grid.z,
block.x,
block.y,
block.z,
2 * block_size * block_size * sizeof(float),
NULL,
args,
NULL));
}
}
else {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
// Launching (advanced method)
int offset = 0;
char argBuffer[256];
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
// pass in launch parameters (not actually de-referencing CUdeviceptr).
// CUdeviceptr is storing the value of the parameters
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
offset += sizeof(d_C);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
offset += sizeof(d_A);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
offset += sizeof(d_B);
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
size_t Matrix_Width_A = (size_t)WA;
size_t Matrix_Width_B = (size_t)WB;
// clean up memory
free(h_A);
free(h_B);
free(h_C);
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
checkCudaErrors(cuCtxDestroy(cuContext));
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
offset += sizeof(Matrix_Width_A);
*(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
offset += sizeof(Matrix_Width_B);
void *kernel_launch_config[5] = {
CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};
// new CUDA 4.0 Driver API Kernel launch call
checkCudaErrors(cuLaunchKernel(matrixMul,
grid.x,
grid.y,
grid.z,
block.x,
block.y,
block.z,
2 * block_size * block_size * sizeof(float),
NULL,
NULL,
reinterpret_cast<void **>(&kernel_launch_config)));
}
// copy result from device to host
checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));
// stop and destroy timer
sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
sdkDeleteTimer(&timer);
printf("Checking computed result for correctness: ");
bool correct = true;
for (int i = 0; i < static_cast<int>(WC * HC); i++) {
if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * valB);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
// clean up memory
free(h_A);
free(h_B);
free(h_C);
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
checkCudaErrors(cuCtxDestroy(cuContext));
}
// Allocates a matrix with random float entries.
void randomInit(float *data, int size) {
for (int i = 0; i < size; ++i) {
data[i] = rand() / static_cast<float>(RAND_MAX);
}
}
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
int *blk_size) {
CUfunction cuFunction = 0;
int major = 0, minor = 0;
char deviceName[100];
cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// get compute capabilities and the devicename
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
printf(" Total amount of global memory: %llu bytes\n",
(long long unsigned int)totalGlobalMem);
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
// first search for the module path before we load the results
std::string module_path;
std::ostringstream fatbin;
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
exit(EXIT_FAILURE);
} else {
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
}
if (!fatbin.str().size()) {
printf("fatbin file empty. exiting..\n");
exit(EXIT_FAILURE);
}
// Create module from binary file (FATBIN)
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
// select the suitable kernel function
const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit",
"matrixMul_bs8_64bit"};
int idx = 0;
int block_size = 32;
while (idx < 3) {
int threadsPerBlock = 0;
int blocksPerGrid = 0;
checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
&blocksPerGrid, &threadsPerBlock, cuFunction, 0,
2 * block_size * block_size * sizeof(float), 0));
if (block_size * block_size <= threadsPerBlock) {
printf("> %d block size selected\n", block_size);
break;
} else {
block_size /= 2;
void randomInit(float *data, int size)
{
for (int i = 0; i < size; ++i) {
data[i] = rand() / static_cast<float>(RAND_MAX);
}
idx++;
}
*pMatrixMul = cuFunction;
*blk_size = block_size;
return 0;
}
static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size)
{
CUfunction cuFunction = 0;
int major = 0, minor = 0;
char deviceName[100];
cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// get compute capabilities and the devicename
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
printf("> GPU Device has SM %d.%d compute capability\n", major, minor);
checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
printf(" Total amount of global memory: %llu bytes\n", (long long unsigned int)totalGlobalMem);
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
// first search for the module path before we load the results
std::string module_path;
std::ostringstream fatbin;
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
exit(EXIT_FAILURE);
}
else {
printf("> initCUDA loading module: <%s>\n", module_path.c_str());
}
if (!fatbin.str().size()) {
printf("fatbin file empty. exiting..\n");
exit(EXIT_FAILURE);
}
// Create module from binary file (FATBIN)
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
// select the suitable kernel function
const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit", "matrixMul_bs8_64bit"};
int idx = 0;
int block_size = 32;
while (idx < 3) {
int threadsPerBlock = 0;
int blocksPerGrid = 0;
checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
&blocksPerGrid, &threadsPerBlock, cuFunction, 0, 2 * block_size * block_size * sizeof(float), 0));
if (block_size * block_size <= threadsPerBlock) {
printf("> %d block size selected\n", block_size);
break;
}
else {
block_size /= 2;
}
idx++;
}
*pMatrixMul = cuFunction;
*blk_size = block_size;
return 0;
}
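The kernel-selection loop above asks cuOccupancyMaxPotentialBlockSize for an occupancy-optimal thread count, given the 2 * block_size * block_size * sizeof(float) bytes of dynamic shared memory the launch will request, and halves block_size from 32 until block_size * block_size fits within that suggestion: for example, a suggestion of 1024 threads per block selects matrixMul_bs32_64bit immediately, while a suggestion of 256 would fall through to the 16x16 variant.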


@ -42,86 +42,87 @@
//! wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
template <int block_size, typename size_type>
__device__ void matrixMul(float *C, float *A, float *B, size_type wA,
size_type wB) {
// Block index
size_type bx = blockIdx.x;
size_type by = blockIdx.y;
__device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB)
{
// Block index
size_type bx = blockIdx.x;
size_type by = blockIdx.y;
// Thread index
size_type tx = threadIdx.x;
size_type ty = threadIdx.y;
// Thread index
size_type tx = threadIdx.x;
size_type ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block
size_type aBegin = wA * block_size * by;
// Index of the first sub-matrix of A processed by the block
size_type aBegin = wA * block_size * by;
// Index of the last sub-matrix of A processed by the block
size_type aEnd = aBegin + wA - 1;
// Index of the last sub-matrix of A processed by the block
size_type aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A
size_type aStep = block_size;
// Step size used to iterate through the sub-matrices of A
size_type aStep = block_size;
// Index of the first sub-matrix of B processed by the block
size_type bBegin = block_size * bx;
// Index of the first sub-matrix of B processed by the block
size_type bBegin = block_size * bx;
// Step size used to iterate through the sub-matrices of B
size_type bStep = block_size * wB;
// Step size used to iterate through the sub-matrices of B
size_type bStep = block_size * wB;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[block_size][block_size];
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[block_size][block_size];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[block_size][block_size];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[block_size][block_size];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
AS(ty, tx) = A[a + wA * ty + tx];
BS(ty, tx) = B[b + wB * ty + tx];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
AS(ty, tx) = A[a + wA * ty + tx];
BS(ty, tx) = B[b + wB * ty + tx];
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Synchronize to make sure the matrices are loaded
__syncthreads();
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (size_type k = 0; k < block_size; ++k) Csub += AS(ty, k) * BS(k, tx);
for (size_type k = 0; k < block_size; ++k)
Csub += AS(ty, k) * BS(k, tx);
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
}
// Write the block sub-matrix to device memory;
// each thread writes one element
size_type c = wB * block_size * by + block_size * bx;
C[c + wB * ty + tx] = Csub;
// Write the block sub-matrix to device memory;
// each thread writes one element
size_type c = wB * block_size * by + block_size * bx;
C[c + wB * ty + tx] = Csub;
}
// C wrappers around our template kernel
extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B,
size_t wA, size_t wB) {
matrixMul<8, size_t>(C, A, B, wA, wB);
extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
matrixMul<8, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B,
size_t wA, size_t wB) {
matrixMul<16, size_t>(C, A, B, wA, wB);
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
matrixMul<16, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B,
size_t wA, size_t wB) {
matrixMul<32, size_t>(C, A, B, wA, wB);
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
matrixMul<32, size_t>(C, A, B, wA, wB);
}
#endif // #ifndef _MATRIXMUL_KERNEL_H_
#endif // #ifndef _MATRIXMUL_KERNEL_H_
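The extern "C" wrappers above exist so each template instantiation is exported under a stable, un-mangled name that the Driver API host code can resolve by string, e.g. cuModuleGetFunction(&cuFunction, cuModule, "matrixMul_bs16_64bit"), as the initCUDA routine in this same change does.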


@ -15,210 +15,211 @@
// With these flags defined, this source file will dynamically
// load the corresponding functions. Disabled by default.
//#define CUDA_INIT_D3D9
//#define CUDA_INIT_D3D10
//#define CUDA_INIT_D3D11
//#define CUDA_INIT_OPENGL
// #define CUDA_INIT_D3D9
// #define CUDA_INIT_D3D10
// #define CUDA_INIT_D3D11
// #define CUDA_INIT_OPENGL
#include <stdio.h>
#include "cuda_drvapi_dynlink.h"
tcuInit *_cuInit;
tcuDriverGetVersion *cuDriverGetVersion;
tcuDeviceGet *cuDeviceGet;
tcuDeviceGetCount *cuDeviceGetCount;
tcuDeviceGetName *cuDeviceGetName;
tcuDeviceComputeCapability *cuDeviceComputeCapability;
tcuDeviceTotalMem *cuDeviceTotalMem;
tcuDeviceGetProperties *cuDeviceGetProperties;
tcuDeviceGetAttribute *cuDeviceGetAttribute;
tcuGetErrorString *cuGetErrorString;
tcuCtxCreate *cuCtxCreate;
tcuCtxDestroy *cuCtxDestroy;
tcuCtxAttach *cuCtxAttach;
tcuCtxDetach *cuCtxDetach;
tcuCtxPushCurrent *cuCtxPushCurrent;
tcuCtxPopCurrent *cuCtxPopCurrent;
tcuCtxGetCurrent *cuCtxGetCurrent;
tcuCtxSetCurrent *cuCtxSetCurrent;
tcuCtxGetDevice *cuCtxGetDevice;
tcuCtxSynchronize *cuCtxSynchronize;
tcuModuleLoad *cuModuleLoad;
tcuModuleLoadData *cuModuleLoadData;
tcuModuleLoadDataEx *cuModuleLoadDataEx;
tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
tcuModuleUnload *cuModuleUnload;
tcuModuleGetFunction *cuModuleGetFunction;
tcuModuleGetGlobal *cuModuleGetGlobal;
tcuModuleGetTexRef *cuModuleGetTexRef;
tcuModuleGetSurfRef *cuModuleGetSurfRef;
tcuMemGetInfo *cuMemGetInfo;
tcuMemAlloc *cuMemAlloc;
tcuMemAllocPitch *cuMemAllocPitch;
tcuMemFree *cuMemFree;
tcuMemGetAddressRange *cuMemGetAddressRange;
tcuMemAllocHost *cuMemAllocHost;
tcuMemFreeHost *cuMemFreeHost;
tcuMemHostAlloc *cuMemHostAlloc;
tcuMemHostGetFlags *cuMemHostGetFlags;
#include <stdio.h>
tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
tcuDeviceGetByPCIBusId *cuDeviceGetByPCIBusId;
tcuDeviceGetPCIBusId *cuDeviceGetPCIBusId;
tcuIpcGetEventHandle *cuIpcGetEventHandle;
tcuIpcOpenEventHandle *cuIpcOpenEventHandle;
tcuIpcGetMemHandle *cuIpcGetMemHandle;
tcuIpcOpenMemHandle *cuIpcOpenMemHandle;
tcuIpcCloseMemHandle *cuIpcCloseMemHandle;
tcuInit *_cuInit;
tcuDriverGetVersion *cuDriverGetVersion;
tcuDeviceGet *cuDeviceGet;
tcuDeviceGetCount *cuDeviceGetCount;
tcuDeviceGetName *cuDeviceGetName;
tcuDeviceComputeCapability *cuDeviceComputeCapability;
tcuDeviceTotalMem *cuDeviceTotalMem;
tcuDeviceGetProperties *cuDeviceGetProperties;
tcuDeviceGetAttribute *cuDeviceGetAttribute;
tcuGetErrorString *cuGetErrorString;
tcuCtxCreate *cuCtxCreate;
tcuCtxDestroy *cuCtxDestroy;
tcuCtxAttach *cuCtxAttach;
tcuCtxDetach *cuCtxDetach;
tcuCtxPushCurrent *cuCtxPushCurrent;
tcuCtxPopCurrent *cuCtxPopCurrent;
tcuCtxGetCurrent *cuCtxGetCurrent;
tcuCtxSetCurrent *cuCtxSetCurrent;
tcuCtxGetDevice *cuCtxGetDevice;
tcuCtxSynchronize *cuCtxSynchronize;
tcuModuleLoad *cuModuleLoad;
tcuModuleLoadData *cuModuleLoadData;
tcuModuleLoadDataEx *cuModuleLoadDataEx;
tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
tcuModuleUnload *cuModuleUnload;
tcuModuleGetFunction *cuModuleGetFunction;
tcuModuleGetGlobal *cuModuleGetGlobal;
tcuModuleGetTexRef *cuModuleGetTexRef;
tcuModuleGetSurfRef *cuModuleGetSurfRef;
tcuMemGetInfo *cuMemGetInfo;
tcuMemAlloc *cuMemAlloc;
tcuMemAllocPitch *cuMemAllocPitch;
tcuMemFree *cuMemFree;
tcuMemGetAddressRange *cuMemGetAddressRange;
tcuMemAllocHost *cuMemAllocHost;
tcuMemFreeHost *cuMemFreeHost;
tcuMemHostAlloc *cuMemHostAlloc;
tcuMemHostGetFlags *cuMemHostGetFlags;
tcuMemHostRegister *cuMemHostRegister;
tcuMemHostUnregister *cuMemHostUnregister;
tcuMemcpyHtoD *cuMemcpyHtoD;
tcuMemcpyDtoH *cuMemcpyDtoH;
tcuMemcpyDtoD *cuMemcpyDtoD;
tcuMemcpyDtoA *cuMemcpyDtoA;
tcuMemcpyAtoD *cuMemcpyAtoD;
tcuMemcpyHtoA *cuMemcpyHtoA;
tcuMemcpyAtoH *cuMemcpyAtoH;
tcuMemcpyAtoA *cuMemcpyAtoA;
tcuMemcpy2D *cuMemcpy2D;
tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
tcuMemcpy3D *cuMemcpy3D;
tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
tcuMemcpy2DAsync *cuMemcpy2DAsync;
tcuMemcpy3DAsync *cuMemcpy3DAsync;
tcuMemcpy *cuMemcpy;
tcuMemcpyPeer *cuMemcpyPeer;
tcuMemsetD8 *cuMemsetD8;
tcuMemsetD16 *cuMemsetD16;
tcuMemsetD32 *cuMemsetD32;
tcuMemsetD2D8 *cuMemsetD2D8;
tcuMemsetD2D16 *cuMemsetD2D16;
tcuMemsetD2D32 *cuMemsetD2D32;
tcuFuncSetBlockShape *cuFuncSetBlockShape;
tcuFuncSetSharedSize *cuFuncSetSharedSize;
tcuFuncGetAttribute *cuFuncGetAttribute;
tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
tcuLaunchKernel *cuLaunchKernel;
tcuArrayCreate *cuArrayCreate;
tcuArrayGetDescriptor *cuArrayGetDescriptor;
tcuArrayDestroy *cuArrayDestroy;
tcuArray3DCreate *cuArray3DCreate;
tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
tcuTexRefCreate *cuTexRefCreate;
tcuTexRefDestroy *cuTexRefDestroy;
tcuTexRefSetArray *cuTexRefSetArray;
tcuTexRefSetAddress *cuTexRefSetAddress;
tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
tcuTexRefSetFormat *cuTexRefSetFormat;
tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
tcuTexRefSetFlags *cuTexRefSetFlags;
tcuTexRefGetAddress *cuTexRefGetAddress;
tcuTexRefGetArray *cuTexRefGetArray;
tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
tcuTexRefGetFormat *cuTexRefGetFormat;
tcuTexRefGetFlags *cuTexRefGetFlags;
tcuSurfRefSetArray *cuSurfRefSetArray;
tcuSurfRefGetArray *cuSurfRefGetArray;
tcuParamSetSize *cuParamSetSize;
tcuParamSeti *cuParamSeti;
tcuParamSetf *cuParamSetf;
tcuParamSetv *cuParamSetv;
tcuParamSetTexRef *cuParamSetTexRef;
tcuLaunch *cuLaunch;
tcuLaunchGrid *cuLaunchGrid;
tcuLaunchGridAsync *cuLaunchGridAsync;
tcuEventCreate *cuEventCreate;
tcuEventRecord *cuEventRecord;
tcuEventQuery *cuEventQuery;
tcuEventSynchronize *cuEventSynchronize;
tcuEventDestroy *cuEventDestroy;
tcuEventElapsedTime *cuEventElapsedTime;
tcuStreamCreate *cuStreamCreate;
tcuStreamWaitEvent *cuStreamWaitEvent;
tcuStreamAddCallback *cuStreamAddCallback;
tcuStreamQuery *cuStreamQuery;
tcuStreamSynchronize *cuStreamSynchronize;
tcuStreamDestroy *cuStreamDestroy;
tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
tcuGraphicsMapResources *cuGraphicsMapResources;
tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
tcuGetExportTable *cuGetExportTable;
tcuCtxSetLimit *cuCtxSetLimit;
tcuCtxGetLimit *cuCtxGetLimit;
tcuCtxGetCacheConfig *cuCtxGetCacheConfig;
tcuCtxSetCacheConfig *cuCtxSetCacheConfig;
tcuCtxGetSharedMemConfig *cuCtxGetSharedMemConfig;
tcuCtxSetSharedMemConfig *cuCtxSetSharedMemConfig;
tcuCtxGetApiVersion *cuCtxGetApiVersion;
tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
tcuDeviceGetByPCIBusId *cuDeviceGetByPCIBusId;
tcuDeviceGetPCIBusId *cuDeviceGetPCIBusId;
tcuIpcGetEventHandle *cuIpcGetEventHandle;
tcuIpcOpenEventHandle *cuIpcOpenEventHandle;
tcuIpcGetMemHandle *cuIpcGetMemHandle;
tcuIpcOpenMemHandle *cuIpcOpenMemHandle;
tcuIpcCloseMemHandle *cuIpcCloseMemHandle;
tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;
tcuMemHostRegister *cuMemHostRegister;
tcuMemHostUnregister *cuMemHostUnregister;
tcuMemcpyHtoD *cuMemcpyHtoD;
tcuMemcpyDtoH *cuMemcpyDtoH;
tcuMemcpyDtoD *cuMemcpyDtoD;
tcuMemcpyDtoA *cuMemcpyDtoA;
tcuMemcpyAtoD *cuMemcpyAtoD;
tcuMemcpyHtoA *cuMemcpyHtoA;
tcuMemcpyAtoH *cuMemcpyAtoH;
tcuMemcpyAtoA *cuMemcpyAtoA;
tcuMemcpy2D *cuMemcpy2D;
tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
tcuMemcpy3D *cuMemcpy3D;
tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
tcuMemcpy2DAsync *cuMemcpy2DAsync;
tcuMemcpy3DAsync *cuMemcpy3DAsync;
tcuMemcpy *cuMemcpy;
tcuMemcpyPeer *cuMemcpyPeer;
tcuMemsetD8 *cuMemsetD8;
tcuMemsetD16 *cuMemsetD16;
tcuMemsetD32 *cuMemsetD32;
tcuMemsetD2D8 *cuMemsetD2D8;
tcuMemsetD2D16 *cuMemsetD2D16;
tcuMemsetD2D32 *cuMemsetD2D32;
tcuFuncSetBlockShape *cuFuncSetBlockShape;
tcuFuncSetSharedSize *cuFuncSetSharedSize;
tcuFuncGetAttribute *cuFuncGetAttribute;
tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
tcuLaunchKernel *cuLaunchKernel;
tcuArrayCreate *cuArrayCreate;
tcuArrayGetDescriptor *cuArrayGetDescriptor;
tcuArrayDestroy *cuArrayDestroy;
tcuArray3DCreate *cuArray3DCreate;
tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
tcuTexRefCreate *cuTexRefCreate;
tcuTexRefDestroy *cuTexRefDestroy;
tcuTexRefSetArray *cuTexRefSetArray;
tcuTexRefSetAddress *cuTexRefSetAddress;
tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
tcuTexRefSetFormat *cuTexRefSetFormat;
tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
tcuTexRefSetFlags *cuTexRefSetFlags;
tcuTexRefGetAddress *cuTexRefGetAddress;
tcuTexRefGetArray *cuTexRefGetArray;
tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
tcuTexRefGetFormat *cuTexRefGetFormat;
tcuTexRefGetFlags *cuTexRefGetFlags;
tcuSurfRefSetArray *cuSurfRefSetArray;
tcuSurfRefGetArray *cuSurfRefGetArray;
tcuParamSetSize *cuParamSetSize;
tcuParamSeti *cuParamSeti;
tcuParamSetf *cuParamSetf;
tcuParamSetv *cuParamSetv;
tcuParamSetTexRef *cuParamSetTexRef;
tcuLaunch *cuLaunch;
tcuLaunchGrid *cuLaunchGrid;
tcuLaunchGridAsync *cuLaunchGridAsync;
tcuEventCreate *cuEventCreate;
tcuEventRecord *cuEventRecord;
tcuEventQuery *cuEventQuery;
tcuEventSynchronize *cuEventSynchronize;
tcuEventDestroy *cuEventDestroy;
tcuEventElapsedTime *cuEventElapsedTime;
tcuStreamCreate *cuStreamCreate;
tcuStreamWaitEvent *cuStreamWaitEvent;
tcuStreamAddCallback *cuStreamAddCallback;
tcuStreamQuery *cuStreamQuery;
tcuStreamSynchronize *cuStreamSynchronize;
tcuStreamDestroy *cuStreamDestroy;
tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
tcuGraphicsMapResources *cuGraphicsMapResources;
tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
tcuGetExportTable *cuGetExportTable;
tcuCtxSetLimit *cuCtxSetLimit;
tcuCtxGetLimit *cuCtxGetLimit;
tcuCtxGetCacheConfig *cuCtxGetCacheConfig;
tcuCtxSetCacheConfig *cuCtxSetCacheConfig;
tcuCtxGetSharedMemConfig *cuCtxGetSharedMemConfig;
tcuCtxSetSharedMemConfig *cuCtxSetSharedMemConfig;
tcuCtxGetApiVersion *cuCtxGetApiVersion;
tcuProfilerStop *cuProfilerStop;
tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;
tcuProfilerStop *cuProfilerStop;
#ifdef CUDA_INIT_D3D9
// D3D9/CUDA interop (CUDA 1.x compatible API). These functions
// are deprecated; please use the ones below
tcuD3D9Begin *cuD3D9Begin;
tcuD3D9End *cuD3DEnd;
tcuD3D9RegisterVertexBuffer *cuD3D9RegisterVertexBuffer;
tcuD3D9MapVertexBuffer *cuD3D9MapVertexBuffer;
tcuD3D9UnmapVertexBuffer *cuD3D9UnmapVertexBuffer;
tcuD3D9UnregisterVertexBuffer *cuD3D9UnregisterVertexBuffer;
tcuD3D9Begin *cuD3D9Begin;
tcuD3D9End *cuD3DEnd;
tcuD3D9RegisterVertexBuffer *cuD3D9RegisterVertexBuffer;
tcuD3D9MapVertexBuffer *cuD3D9MapVertexBuffer;
tcuD3D9UnmapVertexBuffer *cuD3D9UnmapVertexBuffer;
tcuD3D9UnregisterVertexBuffer *cuD3D9UnregisterVertexBuffer;
// D3D9/CUDA interop (CUDA 2.x compatible)
tcuD3D9GetDirect3DDevice *cuD3D9GetDirect3DDevice;
tcuD3D9RegisterResource *cuD3D9RegisterResource;
tcuD3D9UnregisterResource *cuD3D9UnregisterResource;
tcuD3D9MapResources *cuD3D9MapResources;
tcuD3D9UnmapResources *cuD3D9UnmapResources;
tcuD3D9ResourceSetMapFlags *cuD3D9ResourceSetMapFlags;
tcuD3D9ResourceGetSurfaceDimensions *cuD3D9ResourceGetSurfaceDimensions;
tcuD3D9ResourceGetMappedArray *cuD3D9ResourceGetMappedArray;
tcuD3D9ResourceGetMappedPointer *cuD3D9ResourceGetMappedPointer;
tcuD3D9ResourceGetMappedSize *cuD3D9ResourceGetMappedSize;
tcuD3D9ResourceGetMappedPitch *cuD3D9ResourceGetMappedPitch;
tcuD3D9GetDirect3DDevice *cuD3D9GetDirect3DDevice;
tcuD3D9RegisterResource *cuD3D9RegisterResource;
tcuD3D9UnregisterResource *cuD3D9UnregisterResource;
tcuD3D9MapResources *cuD3D9MapResources;
tcuD3D9UnmapResources *cuD3D9UnmapResources;
tcuD3D9ResourceSetMapFlags *cuD3D9ResourceSetMapFlags;
tcuD3D9ResourceGetSurfaceDimensions *cuD3D9ResourceGetSurfaceDimensions;
tcuD3D9ResourceGetMappedArray *cuD3D9ResourceGetMappedArray;
tcuD3D9ResourceGetMappedPointer *cuD3D9ResourceGetMappedPointer;
tcuD3D9ResourceGetMappedSize *cuD3D9ResourceGetMappedSize;
tcuD3D9ResourceGetMappedPitch *cuD3D9ResourceGetMappedPitch;
// D3D9/CUDA interop (CUDA 2.0+)
tcuD3D9GetDevice *cuD3D9GetDevice;
tcuD3D9CtxCreate *cuD3D9CtxCreate;
tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource;
tcuD3D9GetDevice *cuD3D9GetDevice;
tcuD3D9CtxCreate *cuD3D9CtxCreate;
tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource;
#endif
#ifdef CUDA_INIT_D3D10
// D3D10/CUDA interop (CUDA 3.0+)
tcuD3D10GetDevice *cuD3D10GetDevice;
tcuD3D10CtxCreate *cuD3D10CtxCreate;
tcuGraphicsD3D10RegisterResource *cuGraphicsD3D10RegisterResource;
tcuD3D10GetDevice *cuD3D10GetDevice;
tcuD3D10CtxCreate *cuD3D10CtxCreate;
tcuGraphicsD3D10RegisterResource *cuGraphicsD3D10RegisterResource;
#endif
#ifdef CUDA_INIT_D3D11
// D3D11/CUDA interop (CUDA 3.0+)
tcuD3D11GetDevice *cuD3D11GetDevice;
tcuD3D11CtxCreate *cuD3D11CtxCreate;
tcuGraphicsD3D11RegisterResource *cuGraphicsD3D11RegisterResource;
tcuD3D11GetDevice *cuD3D11GetDevice;
tcuD3D11CtxCreate *cuD3D11CtxCreate;
tcuGraphicsD3D11RegisterResource *cuGraphicsD3D11RegisterResource;
#endif
// GL/CUDA interop
#ifdef CUDA_INIT_OPENGL
tcuGLCtxCreate *cuGLCtxCreate;
tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
tcuGLCtxCreate *cuGLCtxCreate;
tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
tcuWGLGetDevice *cuWGLGetDevice;
tcuWGLGetDevice *cuWGLGetDevice;
#endif
#endif
@ -239,8 +240,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
{
*pInstance = LoadLibrary(__CudaLibName);
if (*pInstance == NULL)
{
if (*pInstance == NULL) {
printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
return CUDA_ERROR_UNKNOWN;
}
@ -248,38 +248,35 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
return CUDA_SUCCESS;
}
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
#name, __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));\
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v2), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V3(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));\
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v3), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
#define GET_PROC_EX_V3(name, alias, required) \
alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#elif defined(__unix__) || defined (__QNX__) || defined(__APPLE__) || defined(__MACOSX)
#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || defined(__MACOSX)
#include <dlfcn.h>
#if defined(__APPLE__) || defined(__MACOSX)
static char __CudaLibName[] = "/usr/local/cuda/lib/libcuda.dylib";
#elif defined(__ANDROID__)
#if defined (__aarch64__)
#if defined(__aarch64__)
static char __CudaLibName[] = "/system/vendor/lib64/libcuda.so";
#elif defined(__arm__)
static char __CudaLibName[] = "/system/vendor/lib/libcuda.so";
@ -294,8 +291,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
{
*pInstance = dlopen(__CudaLibName, RTLD_NOW);
if (*pInstance == NULL)
{
if (*pInstance == NULL) {
printf("dlopen \"%s\" failed!\n", __CudaLibName);
return CUDA_ERROR_UNKNOWN;
}
@ -303,52 +299,49 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
return CUDA_SUCCESS;
}
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
#name, __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v2), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V3(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v3), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
#define GET_PROC_EX_V3(name, alias, required) \
alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#else
#error unsupported platform
#endif
#define CHECKED_CALL(call) \
do { \
CUresult result = (call); \
if (CUDA_SUCCESS != result) { \
return result; \
} \
} while(0)
#define CHECKED_CALL(call) \
do { \
CUresult result = (call); \
if (CUDA_SUCCESS != result) { \
return result; \
} \
} while (0)
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0)
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name, name, 1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name, name, 0)
#define GET_PROC(name) GET_PROC_REQUIRED(name)
#define GET_PROC_V2(name) GET_PROC_EX_V2(name,name,1)
#define GET_PROC_V3(name) GET_PROC_EX_V3(name,name,1)
#define GET_PROC_V2(name) GET_PROC_EX_V2(name, name, 1)
#define GET_PROC_V3(name) GET_PROC_EX_V3(name, name, 1)
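Each GET_PROC* expansion above reduces to a symbol lookup (dlsym on Unix-like systems, GetProcAddress on Windows) plus an error check. A stripped-down sketch of the same pattern in isolation, with an illustrative library name and a hypothetical symbol:

#include <dlfcn.h>
#include <stdio.h>

typedef int tcuExample(unsigned int); // hypothetical entry-point signature

static tcuExample *loadExample(void)
{
    void *lib = dlopen("libcuda.so", RTLD_NOW); // illustrative library name
    if (lib == NULL) {
        printf("dlopen \"libcuda.so\" failed!\n");
        return NULL;
    }
    // Look the entry point up by name, exactly as GET_PROC_EX does per function.
    tcuExample *fn = (tcuExample *)dlsym(lib, "cuExample"); // hypothetical symbol
    if (fn == NULL) {
        printf("Failed to find required function \"cuExample\"\n");
        return NULL;
    }
    return fn;
}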
CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
{
CUDADRIVER CudaDrvLib;
int driverVer = 1000;
int driverVer = 1000;
CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));
@ -359,8 +352,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
// available since 2.2. if not present, version 1.0 is assumed
GET_PROC_OPTIONAL(cuDriverGetVersion);
if (cuDriverGetVersion)
{
if (cuDriverGetVersion) {
CHECKED_CALL(cuDriverGetVersion(&driverVer));
}
@ -428,24 +420,21 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuStreamDestroy);
// These are CUDA 5.0 new functions
if (driverVer >= 5000)
{
if (driverVer >= 5000) {
GET_PROC(cuMipmappedArrayCreate);
GET_PROC(cuMipmappedArrayDestroy);
GET_PROC(cuMipmappedArrayGetLevel);
}
// These are CUDA 4.2 new functions
if (driverVer >= 4020)
{
if (driverVer >= 4020) {
GET_PROC(cuFuncSetSharedMemConfig);
GET_PROC(cuCtxGetSharedMemConfig);
GET_PROC(cuCtxSetSharedMemConfig);
}
// These are CUDA 4.1 new functions
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
{
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
GET_PROC(cuDeviceGetByPCIBusId);
GET_PROC(cuDeviceGetPCIBusId);
GET_PROC(cuIpcGetEventHandle);
@ -456,8 +445,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
}
// These could be _v2 interfaces
if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000)
{
if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) {
GET_PROC_V2(cuCtxDestroy);
GET_PROC_V2(cuCtxPopCurrent);
GET_PROC_V2(cuCtxPushCurrent);
@ -465,8 +453,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC_V2(cuEventDestroy);
}
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
{
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
GET_PROC_V2(cuDeviceTotalMem);
GET_PROC_V2(cuCtxCreate);
GET_PROC_V2(cuModuleGetGlobal);
@ -507,17 +494,14 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC_V2(cuTexRefSetAddress);
GET_PROC_V2(cuTexRefGetAddress);
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010)
{
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
GET_PROC_V3(cuTexRefSetAddress2D);
}
else
{
else {
GET_PROC_V2(cuTexRefSetAddress2D);
}
}
else
{
else {
// versions earlier than 3020
GET_PROC(cuDeviceTotalMem);
GET_PROC(cuCtxCreate);
@ -562,8 +546,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
}
// The following functions are specific to CUDA versions
if (driverVer >= 4000)
{
if (driverVer >= 4000) {
GET_PROC(cuCtxSetCurrent);
GET_PROC(cuCtxGetCurrent);
GET_PROC(cuMemHostRegister);
@ -574,8 +557,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuProfilerStop);
}
if (driverVer >= 3010)
{
if (driverVer >= 3010) {
GET_PROC(cuModuleGetSurfRef);
GET_PROC(cuSurfRefSetArray);
GET_PROC(cuSurfRefGetArray);
@ -583,8 +565,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuCtxGetLimit);
}
if (driverVer >= 3000)
{
if (driverVer >= 3000) {
GET_PROC(cuMemcpyDtoDAsync);
GET_PROC(cuFuncSetCacheConfig);
#ifdef CUDA_INIT_D3D11
@ -595,12 +576,10 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuGraphicsUnregisterResource);
GET_PROC(cuGraphicsSubResourceGetMappedArray);
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020)
{
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
GET_PROC_V2(cuGraphicsResourceGetMappedPointer);
}
else
{
else {
GET_PROC(cuGraphicsResourceGetMappedPointer);
}
@ -610,8 +589,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuGetExportTable);
}
if (driverVer >= 2030)
{
if (driverVer >= 2030) {
GET_PROC(cuMemHostGetFlags);
#ifdef CUDA_INIT_D3D10
GET_PROC(cuD3D10GetDevice);
@ -624,17 +602,16 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
#endif
}
if (driverVer >= 2010)
{
if (driverVer >= 2010) {
GET_PROC(cuModuleLoadDataEx);
GET_PROC(cuModuleLoadFatBinary);
#ifdef CUDA_INIT_OPENGL
GET_PROC(cuGLCtxCreate);
GET_PROC(cuGraphicsGLRegisterBuffer);
GET_PROC(cuGraphicsGLRegisterImage);
# ifdef WIN32
#ifdef WIN32
GET_PROC(cuWGLGetDevice);
# endif
#endif
#endif
#ifdef CUDA_INIT_D3D9
GET_PROC(cuD3D9GetDevice);
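The hunks above only reformat the version-gated GET_PROC lookups; the pattern they touch is runtime resolution of driver entry points from the CUDA driver library, with newer symbols requested only when the reported driver version allows it. A minimal, self-contained sketch of that idea follows; the library path, typedef, and variable names are illustrative assumptions rather than code from this commit.

/* Sketch only: resolve one CUDA driver entry point at runtime (Linux).
 * Build with e.g. cc probe_driver.c -ldl; the file name is hypothetical. */
#include <dlfcn.h>
#include <stdio.h>

/* CUresult is an int-sized enum; 0 means CUDA_SUCCESS. */
typedef int (*cuDriverGetVersion_t)(int *driverVersion);

int main(void)
{
    void *lib = dlopen("libcuda.so.1", RTLD_NOW);
    if (!lib) {
        fprintf(stderr, "could not load CUDA driver: %s\n", dlerror());
        return 1;
    }

    cuDriverGetVersion_t pGetVersion = (cuDriverGetVersion_t)dlsym(lib, "cuDriverGetVersion");

    int ver = 0;
    if (pGetVersion && pGetVersion(&ver) == 0) {
        /* The driver encodes the version as 1000 * major + 10 * minor. */
        printf("driver reports CUDA %d.%d\n", ver / 1000, (ver % 1000) / 10);
    }

    dlclose(lib);
    return 0;
}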

View File

@ -14,21 +14,17 @@
#ifndef HELPER_CUDA_DRVAPI_H
#define HELPER_CUDA_DRVAPI_H
#include <helper_string.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <helper_string.h>
#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
#endif
#ifndef HELPER_CUDA_DRVAPI_H
inline int ftoi(float value) {
return (value >= 0 ? static_cast<int>(value + 0.5)
: static_cast<int>(value - 0.5));
}
inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5) : static_cast<int>(value - 0.5)); }
#endif
#ifndef EXIT_WAIVED
@ -47,311 +43,302 @@ inline int ftoi(float value) {
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
// These are the inline versions for all of the SDK helper functions
inline void __checkCudaErrors(CUresult err, const char *file, const int line) {
if (CUDA_SUCCESS != err) {
const char *errorStr = NULL;
cuGetErrorString(err, &errorStr);
fprintf(stderr,
"checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
"line %i.\n",
err, errorStr, file, line);
exit(EXIT_FAILURE);
}
inline void __checkCudaErrors(CUresult err, const char *file, const int line)
{
if (CUDA_SUCCESS != err) {
const char *errorStr = NULL;
cuGetErrorString(err, &errorStr);
fprintf(stderr,
"checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
"line %i.\n",
err,
errorStr,
file,
line);
exit(EXIT_FAILURE);
}
}
#endif
// This function wraps the CUDA Driver API into a template function
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
int device) {
checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
}
#endif
// Beginning of GPU Architecture definitions
inline int _ConvertSMVer2CoresDRV(int major, int minor) {
// Defines for GPU Architecture types (using the SM version to determine the #
// of cores per SM
typedef struct {
int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM
// minor version
int Cores;
} sSMtoCores;
inline int _ConvertSMVer2CoresDRV(int major, int minor)
{
// Defines for GPU Architecture types (using the SM version to determine the #
// of cores per SM
typedef struct
{
int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM
// minor version
int Cores;
} sSMtoCores;
sSMtoCores nGpuArchCoresPerSM[] = {
{0x30, 192},
{0x32, 192},
{0x35, 192},
{0x37, 192},
{0x50, 128},
{0x52, 128},
{0x53, 128},
{0x60, 64},
{0x61, 128},
{0x62, 128},
{0x70, 64},
{0x72, 64},
{0x75, 64},
{0x80, 64},
{0x86, 128},
{0x87, 128},
{0x90, 128},
{-1, -1}};
sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192},
{0x32, 192},
{0x35, 192},
{0x37, 192},
{0x50, 128},
{0x52, 128},
{0x53, 128},
{0x60, 64},
{0x61, 128},
{0x62, 128},
{0x70, 64},
{0x72, 64},
{0x75, 64},
{0x80, 64},
{0x86, 128},
{0x87, 128},
{0x90, 128},
{-1, -1}};
int index = 0;
int index = 0;
while (nGpuArchCoresPerSM[index].SM != -1) {
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
return nGpuArchCoresPerSM[index].Cores;
while (nGpuArchCoresPerSM[index].SM != -1) {
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
return nGpuArchCoresPerSM[index].Cores;
}
index++;
}
index++;
}
// If we don't find the values, we default to using the previous one to run
// properly
printf(
"MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
major, minor, nGpuArchCoresPerSM[index - 1].Cores);
return nGpuArchCoresPerSM[index - 1].Cores;
// If we don't find the values, we default to using the previous one to run
// properly
printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
major,
minor,
nGpuArchCoresPerSM[index - 1].Cores);
return nGpuArchCoresPerSM[index - 1].Cores;
}
// end of GPU Architecture definitions
// end of GPU Architecture definitions
#ifdef __cuda_cuda_h__
// General GPU Device CUDA Initialization
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) {
int cuDevice = 0;
int deviceCount = 0;
checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
{
int cuDevice = 0;
int deviceCount = 0;
checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
checkCudaErrors(cuDeviceGetCount(&deviceCount));
checkCudaErrors(cuDeviceGetCount(&deviceCount));
if (deviceCount == 0) {
fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
exit(EXIT_FAILURE);
}
if (deviceCount == 0) {
fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
exit(EXIT_FAILURE);
}
int dev = 0;
dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device=");
int dev = 0;
dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device=");
if (dev < 0) {
dev = 0;
}
if (dev < 0) {
dev = 0;
}
if (dev > deviceCount - 1) {
fprintf(stderr, "\n");
fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
deviceCount);
fprintf(stderr,
">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n",
dev);
fprintf(stderr, "\n");
return -dev;
}
if (dev > deviceCount - 1) {
fprintf(stderr, "\n");
fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
fprintf(stderr, "\n");
return -dev;
}
checkCudaErrors(cuDeviceGet(&cuDevice, dev));
char name[100];
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
checkCudaErrors(cuDeviceGet(&cuDevice, dev));
char name[100];
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
if (computeMode == CU_COMPUTEMODE_PROHIBITED) {
fprintf(stderr,
"Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no "
"threads can use this CUDA Device.\n");
return -1;
}
if (computeMode == CU_COMPUTEMODE_PROHIBITED) {
fprintf(stderr,
"Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no "
"threads can use this CUDA Device.\n");
return -1;
}
if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) {
printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
}
if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) {
printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
}
return dev;
return dev;
}
// This function returns the best GPU based on performance
inline int gpuGetMaxGflopsDeviceIdDRV() {
CUdevice current_device = 0;
CUdevice max_perf_device = 0;
int device_count = 0;
int sm_per_multiproc = 0;
unsigned long long max_compute_perf = 0;
int major = 0;
int minor = 0;
int multiProcessorCount;
int clockRate;
int devices_prohibited = 0;
inline int gpuGetMaxGflopsDeviceIdDRV()
{
CUdevice current_device = 0;
CUdevice max_perf_device = 0;
int device_count = 0;
int sm_per_multiproc = 0;
unsigned long long max_compute_perf = 0;
int major = 0;
int minor = 0;
int multiProcessorCount;
int clockRate;
int devices_prohibited = 0;
cuInit(0, __CUDA_API_VERSION);
checkCudaErrors(cuDeviceGetCount(&device_count));
cuInit(0, __CUDA_API_VERSION);
checkCudaErrors(cuDeviceGetCount(&device_count));
if (device_count == 0) {
fprintf(stderr,
"gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
exit(EXIT_FAILURE);
}
// Find the best CUDA capable GPU device
current_device = 0;
while (current_device < device_count) {
checkCudaErrors(cuDeviceGetAttribute(
&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
current_device));
checkCudaErrors(cuDeviceGetAttribute(
&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,
current_device);
if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
if (major == 9999 && minor == 9999) {
sm_per_multiproc = 1;
} else {
sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
}
unsigned long long compute_perf =
(unsigned long long)(multiProcessorCount * sm_per_multiproc *
clockRate);
if (compute_perf > max_compute_perf) {
max_compute_perf = compute_perf;
max_perf_device = current_device;
}
} else {
devices_prohibited++;
if (device_count == 0) {
fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
exit(EXIT_FAILURE);
}
++current_device;
}
// Find the best CUDA capable GPU device
current_device = 0;
if (devices_prohibited == device_count) {
fprintf(stderr,
"gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode "
"prohibited.\n");
exit(EXIT_FAILURE);
}
while (current_device < device_count) {
checkCudaErrors(
cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device));
checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
return max_perf_device;
int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
if (major == 9999 && minor == 9999) {
sm_per_multiproc = 1;
}
else {
sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
}
unsigned long long compute_perf = (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);
if (compute_perf > max_compute_perf) {
max_compute_perf = compute_perf;
max_perf_device = current_device;
}
}
else {
devices_prohibited++;
}
++current_device;
}
if (devices_prohibited == device_count) {
fprintf(stderr,
"gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode "
"prohibited.\n");
exit(EXIT_FAILURE);
}
return max_perf_device;
}
// General initialization call to pick the best CUDA Device
inline CUdevice findCudaDeviceDRV(int argc, const char **argv) {
CUdevice cuDevice;
int devID = 0;
inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
{
CUdevice cuDevice;
int devID = 0;
// If the command-line has a device number specified, use it
if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
devID = gpuDeviceInitDRV(argc, argv);
// If the command-line has a device number specified, use it
if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
devID = gpuDeviceInitDRV(argc, argv);
if (devID < 0) {
printf("exiting...\n");
exit(EXIT_SUCCESS);
if (devID < 0) {
printf("exiting...\n");
exit(EXIT_SUCCESS);
}
}
else {
// Otherwise pick the device with highest Gflops/s
char name[100];
devID = gpuGetMaxGflopsDeviceIdDRV();
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
cuDeviceGetName(name, 100, cuDevice);
printf("> Using CUDA Device [%d]: %s\n", devID, name);
}
} else {
// Otherwise pick the device with highest Gflops/s
char name[100];
devID = gpuGetMaxGflopsDeviceIdDRV();
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
cuDeviceGetName(name, 100, cuDevice);
printf("> Using CUDA Device [%d]: %s\n", devID, name);
}
cuDeviceGet(&cuDevice, devID);
cuDeviceGet(&cuDevice, devID);
return cuDevice;
return cuDevice;
}
inline CUdevice findIntegratedGPUDrv() {
CUdevice current_device = 0;
int device_count = 0;
int devices_prohibited = 0;
int isIntegrated;
inline CUdevice findIntegratedGPUDrv()
{
CUdevice current_device = 0;
int device_count = 0;
int devices_prohibited = 0;
int isIntegrated;
cuInit(0, __CUDA_API_VERSION);
checkCudaErrors(cuDeviceGetCount(&device_count));
cuInit(0, __CUDA_API_VERSION);
checkCudaErrors(cuDeviceGetCount(&device_count));
if (device_count == 0) {
fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
exit(EXIT_FAILURE);
}
// Find the integrated GPU which is compute capable
while (current_device < device_count) {
int computeMode = -1;
checkCudaErrors(cuDeviceGetAttribute(
&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
// If the GPU is integrated and is not running in Compute Mode prohibited,
// use it
if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
int major = 0, minor = 0;
char deviceName[256];
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
current_device));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
current_device));
checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
current_device, deviceName, major, minor);
return current_device;
} else {
devices_prohibited++;
if (device_count == 0) {
fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
exit(EXIT_FAILURE);
}
current_device++;
}
// Find the integrated GPU which is compute capable
while (current_device < device_count) {
int computeMode = -1;
checkCudaErrors(cuDeviceGetAttribute(&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
checkCudaErrors(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
if (devices_prohibited == device_count) {
fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n");
exit(EXIT_FAILURE);
}
// If the GPU is integrated and is not running in Compute Mode prohibited,
// use it
if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
int major = 0, minor = 0;
char deviceName[256];
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device, deviceName, major, minor);
return -1;
return current_device;
}
else {
devices_prohibited++;
}
current_device++;
}
if (devices_prohibited == device_count) {
fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n");
exit(EXIT_FAILURE);
}
return -1;
}
// General check for CUDA GPU SM Capabilities
inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version,
int devID) {
CUdevice cuDevice;
char name[256];
int major = 0, minor = 0;
inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
{
CUdevice cuDevice;
char name[256];
int major = 0, minor = 0;
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
if ((major > major_version) ||
(major == major_version && minor >= minor_version)) {
printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name,
major, minor);
return true;
} else {
printf(
"No GPU device was found that can support CUDA compute capability "
"%d.%d.\n",
major_version, minor_version);
return false;
}
if ((major > major_version) || (major == major_version && minor >= minor_version)) {
printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
return true;
}
else {
printf("No GPU device was found that can support CUDA compute capability "
"%d.%d.\n",
major_version,
minor_version);
return false;
}
}
#endif
// end of CUDA Helper Functions
#endif // HELPER_CUDA_DRVAPI_H
// end of CUDA Helper Functions
#endif // HELPER_CUDA_DRVAPI_H
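To make the selection heuristic above concrete: gpuGetMaxGflopsDeviceIdDRV scores every visible device as multiProcessorCount * cores-per-SM * clockRate (clockRate is reported in kHz) and returns the highest-scoring one, skipping devices whose compute mode is prohibited. For a hypothetical SM 8.6 part with 46 SMs and a 1.7 GHz clock, that score is 46 * 128 * 1,700,000 ≈ 1.0e10.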

View File

@ -34,8 +34,8 @@
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA // Matrix B height
#define WC WB // Matrix C width
#define HC HA // Matrix C height
#define HB WA // Matrix B height
#define WC WB // Matrix C width
#define HC HA // Matrix C height
#endif // _MATRIXMUL_H_

View File

@ -43,10 +43,10 @@
*/
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes, CUDA
#include "cuda_drvapi_dynlink.h"
@ -60,7 +60,7 @@
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
#if defined _MSC_VER
#pragma warning (disable : 4312)
#pragma warning(disable : 4312)
#endif
@ -68,7 +68,7 @@ extern "C" void computeGold(float *, const float *, const float *, unsigned int,
// Globals
////////////////////////////////////////////////////////////////////////////////
CUcontext g_cuContext;
bool noprompt = false;
bool noprompt = false;
static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
@ -78,8 +78,7 @@ static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
////////////////////////////////////////////////////////////////////////////////
void randomInit(float *data, size_t size)
{
for (size_t i = 0; i < size; ++i)
{
for (size_t i = 0; i < size; ++i) {
data[i] = rand() / (float)RAND_MAX;
}
}
@ -89,33 +88,29 @@ void randomInit(float *data, size_t size)
////////////////////////////////////////////////////////////////////////////////
CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size_out)
{
CUresult status;
CUdevice cuDevice;
CUmodule cuModule;
CUresult status;
CUdevice cuDevice;
CUmodule cuModule;
CUfunction cuFunction;
int major, minor, block_size, devID = 0;
char deviceName[256];
int major, minor, block_size, devID = 0;
char deviceName[256];
// link to cuda driver dynamically
checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
// This assumes that the user is attempting to specify an explicit device -device=n
if (argc > 1)
{
if (argc > 1) {
bool bFound = false;
for (int param=0; param < argc; param++)
{
if (!strncmp(argv[param], "-device", 7))
{
int i=(int)strlen(argv[1]);
for (int param = 0; param < argc; param++) {
if (!strncmp(argv[param], "-device", 7)) {
int i = (int)strlen(argv[1]);
while (argv[1][i] != '=')
{
while (argv[1][i] != '=') {
i--;
}
devID = atoi(&argv[1][++i]);
devID = atoi(&argv[1][++i]);
bFound = true;
}
@ -128,16 +123,15 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
int deviceCount = 0;
checkCudaErrors(cuDeviceGetCount(&deviceCount));
if (deviceCount == 0)
{
if (deviceCount == 0) {
fprintf(stderr, "No devices supporting CUDA detected, exiting...\n");
exit(EXIT_SUCCESS);
}
if (devID < 0) devID = 0;
if (devID < 0)
devID = 0;
if (devID > deviceCount -1)
{
if (devID > deviceCount - 1) {
fprintf(stderr, "initCUDA (Device=%d) invalid GPU device. %d GPU device(s) detected.\n\n", devID, deviceCount);
status = CUDA_ERROR_NOT_FOUND;
@ -153,14 +147,13 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice));
printf("> Device %d: \"%s\" with Compute %d.%d capability\n", cuDevice, deviceName, major, minor);
block_size = 32;
block_size = 32;
*block_size_out = block_size;
// create context for picked device
status = cuCtxCreate(&g_cuContext, 0, cuDevice);
if (CUDA_SUCCESS != status)
{
if (CUDA_SUCCESS != status) {
cuCtxDestroy(g_cuContext);
exit(EXIT_SUCCESS);
}
@ -169,53 +162,53 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
{
// in this branch we use compilation with parameters
const unsigned int jitNumOptions = 3;
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
void **jitOptVals = new void *[jitNumOptions];
CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
void **jitOptVals = new void *[jitNumOptions];
// set up size of compilation log buffer
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
int jitLogBufferSize = 1024;
jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
// set up pointer to the compilation log buffer
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
char *jitLogBuffer = new char[jitLogBufferSize];
jitOptVals[1] = jitLogBuffer;
jitOptVals[1] = jitLogBuffer;
// set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[2] = CU_JIT_MAX_REGISTERS;
jitOptions[2] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 32;
jitOptVals[2] = (void *)(size_t)jitRegCount;
jitOptVals[2] = (void *)(size_t)jitRegCount;
// compile with set parameters
printf("> Compiling CUDA module\n");
#if defined(_WIN64) || defined(__LP64__)
status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
status =
cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
#else
status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
status =
cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
#endif
printf("> PTX JIT log:\n%s\n", jitLogBuffer);
delete [] jitOptions;
delete [] jitOptVals;
delete [] jitLogBuffer;
delete[] jitOptions;
delete[] jitOptVals;
delete[] jitLogBuffer;
}
if (CUDA_SUCCESS != status)
{
if (CUDA_SUCCESS != status) {
printf("Error while compiling PTX\n");
cuCtxDestroy(g_cuContext);
exit(EXIT_FAILURE);
}
// retrieve CUDA function from the compiled module
status = cuModuleGetFunction(&cuFunction, cuModule,
(block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
status = cuModuleGetFunction(
&cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
if (CUDA_SUCCESS != status)
{
if (CUDA_SUCCESS != status) {
cuCtxDestroy(g_cuContext);
exit(EXIT_FAILURE);
}
@ -233,21 +226,21 @@ int main(int argc, char **argv)
printf("[ %s ]\n", sSDKsample);
// initialize CUDA
CUfunction matrixMul = NULL;
int block_size = 0;
CUfunction matrixMul = NULL;
int block_size = 0;
checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size));
// set seed for rand()
srand(2006);
// allocate host memory for matrices A and B
size_t size_A = WA * HA;
size_t mem_size_A = sizeof(float) * size_A;
size_t size_B = WB * HB;
size_t mem_size_B = sizeof(float) * size_B;
size_t size_A = WA * HA;
size_t mem_size_A = sizeof(float) * size_A;
size_t size_B = WB * HB;
size_t mem_size_B = sizeof(float) * size_B;
float *h_A = (float *) malloc(mem_size_A);
float *h_B = (float *) malloc(mem_size_B);
float *h_A = (float *)malloc(mem_size_A);
float *h_B = (float *)malloc(mem_size_B);
// initialize host memory
randomInit(h_A, size_A);
@ -264,26 +257,24 @@ int main(int argc, char **argv)
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// allocate device memory for result
size_t size_C = WC * HC;
size_t mem_size_C = sizeof(float) * size_C;
size_t size_C = WC * HC;
size_t mem_size_C = sizeof(float) * size_C;
CUdeviceptr d_C;
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
// allocate mem for the result on host side
float *h_C = (float *) malloc(mem_size_C);
float *h_C = (float *)malloc(mem_size_C);
#if __CUDA_API_VERSION >= 4000
{
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
int Matrix_Width_A = WA;
int Matrix_Width_B = WB;
void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B };
int Matrix_Width_A = WA;
int Matrix_Width_B = WB;
void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
checkCudaErrors(cuLaunchKernel(matrixMul, (WC/block_size), (HC/block_size), 1,
block_size , block_size , 1,
0,
NULL, args, NULL));
checkCudaErrors(cuLaunchKernel(
matrixMul, (WC / block_size), (HC / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
}
#else // __CUDA_API_VERSION <= 3020
{
@ -312,7 +303,7 @@ int main(int argc, char **argv)
checkCudaErrors(cuParamSetSize(matrixMul, offset));
checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1));
checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2*block_size*block_size*sizeof(float)));
checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2 * block_size * block_size * sizeof(float)));
// set execution configuration for the CUDA kernel
checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size));
@ -322,19 +313,18 @@ int main(int argc, char **argv)
checkCudaErrors(cuCtxSynchronize());
// copy result from device to host
checkCudaErrors(cuMemcpyDtoH((void *) h_C, d_C, mem_size_C));
checkCudaErrors(cuMemcpyDtoH((void *)h_C, d_C, mem_size_C));
// compute reference solution
float *reference = (float *) malloc(mem_size_C);
float *reference = (float *)malloc(mem_size_C);
computeGold(reference, h_A, h_B, HA, WA, WB);
// check result
float diff=0.0f;
float diff = 0.0f;
for (unsigned int i=0; i<size_C; i++)
{
for (unsigned int i = 0; i < size_C; i++) {
float tmp = reference[i] - h_C[i];
diff += tmp*tmp;
diff += tmp * tmp;
}
int res = (diff / (float)size_C < 1e-6f);
@ -349,7 +339,7 @@ int main(int argc, char **argv)
checkCudaErrors(cuMemFree(d_C));
checkCudaErrors(cuCtxDestroy(g_cuContext));
printf("Test run %s\n", (1==res) ? "success!" : "failed!");
printf("Test run %s\n", (1 == res) ? "success!" : "failed!");
exit((1 == res) ? EXIT_SUCCESS : EXIT_FAILURE);
}
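Written out, the pass/fail test at the end of this sample is a mean-squared-error threshold over the WC x HC result: it reports success exactly when

(1 / size_C) * sum_i (reference[i] - h_C[i])^2 < 1e-6

where reference is the CPU result produced by computeGold.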

View File

@ -28,8 +28,7 @@
////////////////////////////////////////////////////////////////////////////////
// export C interface
extern "C"
void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set
@ -40,16 +39,13 @@ void computeGold(float *, const float *, const float *, unsigned int, unsigned i
//! @param hA height of matrix A
//! @param wB width of matrix B
////////////////////////////////////////////////////////////////////////////////
void
computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
void computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{
for (unsigned int i = 0; i < hA; ++i)
for (unsigned int j = 0; j < wB; ++j)
{
for (unsigned int j = 0; j < wB; ++j) {
double sum = 0;
for (unsigned int k = 0; k < wA; ++k)
{
for (unsigned int k = 0; k < wA; ++k) {
double a = A[i * wA + k];
double b = B[k * wB + j];
sum += a * b;

View File

@ -32,7 +32,8 @@
#define __matrixMul_kernel_32_ptxdump_h__
#if defined __cplusplus
extern "C" {
extern "C"
{
#endif
extern unsigned char matrixMul_kernel_32_ptxdump[25784];

View File

@ -32,7 +32,8 @@
#define __matrixMul_kernel_64_ptxdump_h__
#if defined __cplusplus
extern "C" {
extern "C"
{
#endif
extern unsigned char matrixMul_kernel_64_ptxdump[26489];

View File

@ -42,207 +42,208 @@
*/
// System includes
#include <stdio.h>
#include <assert.h>
#include <stdio.h>
// CUDA runtime
#include <cuda_runtime.h>
#include "nvrtc_helper.h"
// Helper functions and utilities to work with CUDA
#include <helper_functions.h>
void constantInit(float *data, int size, float val) {
for (int i = 0; i < size; ++i) {
data[i] = val;
}
void constantInit(float *data, int size, float val)
{
for (int i = 0; i < size; ++i) {
data[i] = val;
}
}
/**
* Run a simple test of matrix multiplication using CUDA
*/
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA,
dim3 &dimsB) {
// Allocate host memory for matrices A and B
unsigned int size_A = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = dimsB.x * dimsB.y;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B);
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
{
// Allocate host memory for matrices A and B
unsigned int size_A = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = dimsB.x * dimsB.y;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B);
// Initialize host memory
const float valB = 0.01f;
constantInit(h_A, size_A, 1.0f);
constantInit(h_B, size_B, valB);
// Initialize host memory
const float valB = 0.01f;
constantInit(h_A, size_A, 1.0f);
constantInit(h_B, size_B, valB);
// Allocate device memory
CUdeviceptr d_A, d_B, d_C;
// Allocate device memory
CUdeviceptr d_A, d_B, d_C;
char *cubin, *kernel_file;
size_t cubinSize;
char *cubin, *kernel_file;
size_t cubinSize;
kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1);
kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1);
CUmodule module = loadCUBIN(cubin, argc, argv);
CUmodule module = loadCUBIN(cubin, argc, argv);
// Allocate host matrix C
dim3 dimsC(dimsB.x, dimsA.y, 1);
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
float *h_C = (float *)malloc(mem_size_C);
// Allocate host matrix C
dim3 dimsC(dimsB.x, dimsA.y, 1);
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
float *h_C = (float *)malloc(mem_size_C);
if (h_C == NULL) {
fprintf(stderr, "Failed to allocate host matrix C!\n");
exit(EXIT_FAILURE);
}
checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
// copy host memory to device
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// Setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
// Create and start timer
printf("Computing result using CUDA Kernel...\n");
CUfunction kernel_addr;
if (block_size == 16) {
checkCudaErrors(
cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
} else {
checkCudaErrors(
cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
}
void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x,
(void *)&dimsB.x};
// Execute the kernel
int nIter = 300;
for (int j = 0; j < nIter; j++) {
checkCudaErrors(
cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */
threads.x, threads.y, threads.z, /* block dim */
0, 0, /* shared mem, stream */
&arr[0], /* arguments */
0));
checkCudaErrors(cuCtxSynchronize());
}
// Copy result from device to host
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));
printf("Checking computed result for correctness: ");
bool correct = true;
// test relative error by the formula
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
double eps = 1.e-6; // machine zero
for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
double abs_err = fabs(h_C[i] - (dimsA.x * valB));
double dot_length = dimsA.x;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
h_C[i], dimsA.x * valB, eps);
correct = false;
if (h_C == NULL) {
fprintf(stderr, "Failed to allocate host matrix C!\n");
exit(EXIT_FAILURE);
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
printf(
"\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
// copy host memory to device
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// Clean up memory
free(h_A);
free(h_B);
free(h_C);
// Setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
// Create and start timer
printf("Computing result using CUDA Kernel...\n");
if (correct) {
return EXIT_SUCCESS;
} else {
return EXIT_FAILURE;
}
CUfunction kernel_addr;
if (block_size == 16) {
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
}
else {
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
}
void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x};
// Execute the kernel
int nIter = 300;
for (int j = 0; j < nIter; j++) {
checkCudaErrors(cuLaunchKernel(kernel_addr,
grid.x,
grid.y,
grid.z, /* grid dim */
threads.x,
threads.y,
threads.z, /* block dim */
0,
0, /* shared mem, stream */
&arr[0], /* arguments */
0));
checkCudaErrors(cuCtxSynchronize());
}
// Copy result from device to host
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));
printf("Checking computed result for correctness: ");
bool correct = true;
// test relative error by the formula
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
double eps = 1.e-6; // machine zero
for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
double abs_err = fabs(h_C[i] - (dimsA.x * valB));
double dot_length = dimsA.x;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
// Clean up memory
free(h_A);
free(h_B);
free(h_C);
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
if (correct) {
return EXIT_SUCCESS;
}
else {
return EXIT_FAILURE;
}
}
/**
* Program main
*/
int main(int argc, char **argv) {
printf("[Matrix Multiply Using CUDA] - Starting...\n");
int main(int argc, char **argv)
{
printf("[Matrix Multiply Using CUDA] - Starting...\n");
if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
checkCmdLineFlag(argc, (const char **)argv, "?")) {
printf("Usage -device=n (n >= 0 for deviceID)\n");
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
printf(
" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
printf("Usage -device=n (n >= 0 for deviceID)\n");
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
exit(EXIT_SUCCESS);
}
exit(EXIT_SUCCESS);
}
int block_size = 32;
int block_size = 32;
// original:
dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
// original:
dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
// reduce sizes to avoid running out of memory
// dim3 dimsA(32,32, 1);
// dim3 dimsB(32,32,1);
// reduce sizes to avoid running out of memory
// dim3 dimsA(32,32, 1);
// dim3 dimsB(32,32,1);
// width of Matrix A
if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
}
// width of Matrix A
if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
}
// height of Matrix A
if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
}
// height of Matrix A
if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
}
// width of Matrix B
if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
}
// width of Matrix B
if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
}
// height of Matrix B
if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
}
// height of Matrix B
if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
}
if (dimsA.x != dimsB.y) {
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n",
dimsA.x, dimsB.y);
exit(EXIT_FAILURE);
}
if (dimsA.x != dimsB.y) {
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
exit(EXIT_FAILURE);
}
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x,
dimsB.y);
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
exit(matrix_result);
exit(matrix_result);
}
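Spelled out, the per-element correctness test in matrixMultiply accepts element i when

|h_C[i] - dimsA.x * valB| / (|h_C[i]| * dimsA.x) <= 1e-6

i.e. the relative error against the analytic value wA * 0.01 (A is all ones and B is all 0.01, so every element of C equals the dot-product length times 0.01), normalized by that dot-product length.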

View File

@ -48,84 +48,83 @@
#include <cooperative_groups.h>
template <int BLOCK_SIZE>
__device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) {
// Handle to thread block group
cooperative_groups::thread_block cta =
cooperative_groups::this_thread_block();
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
// Handle to thread block group
cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block
int aBegin = wA * BLOCK_SIZE * by;
// Index of the first sub-matrix of A processed by the block
int aBegin = wA * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + wA - 1;
// Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;
// Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;
// Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * wB;
// Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * wB;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
// Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
// Declaration of the shared memory array As used to
// store the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to
// store the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
// Load the matrices from device memory
// to shared memory; each thread loads
// one element of each matrix
As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx];
// Synchronize to make sure the matrices are loaded
cooperative_groups::sync(cta);
// Synchronize to make sure the matrices are loaded
cooperative_groups::sync(cta);
// Multiply the two matrices together;
// each thread computes one element
// of the block sub-matrix
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx];
for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
cooperative_groups::sync(cta);
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
cooperative_groups::sync(cta);
}
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
}
extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B,
int wA, int wB) {
matrixMulCUDA<16>(C, A, B, wA, wB);
extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
{
matrixMulCUDA<16>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B,
int wA, int wB) {
matrixMulCUDA<32>(C, A, B, wA, wB);
extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
{
matrixMulCUDA<32>(C, A, B, wA, wB);
}
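A quick sizing note on the tiled kernel above: each block statically allocates two BLOCK_SIZE x BLOCK_SIZE float tiles (As and Bs), i.e. 2 * 32 * 32 * 4 B = 8 KB of shared memory for the block-32 instantiation and 2 * 16 * 16 * 4 B = 2 KB for block-16, and the grid launched by the host is (wB / BLOCK_SIZE) x (hA / BLOCK_SIZE) blocks.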

View File

@ -28,252 +28,254 @@
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include <assert.h>
#include <helper_cuda.h>
#include "mergeSort_common.h"
inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB,
uint &valB, uint arrowDir) {
uint t;
inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB, uint arrowDir)
{
uint t;
if ((keyA > keyB) == arrowDir) {
t = keyA;
keyA = keyB;
keyB = t;
t = valA;
valA = valB;
valB = t;
}
if ((keyA > keyB) == arrowDir) {
t = keyA;
keyA = keyB;
keyB = t;
t = valA;
valA = valB;
valB = t;
}
}
__global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
uint *d_SrcKey, uint *d_SrcVal,
uint arrayLength, uint sortDir) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
// Shared memory storage for one or more short vectors
__shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT];
__global__ void
bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint sortDir)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
// Shared memory storage for one or more short vectors
__shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT];
// Offset to the beginning of subbatch and load data
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[0];
s_val[threadIdx.x + 0] = d_SrcVal[0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
// Offset to the beginning of subbatch and load data
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[0];
s_val[threadIdx.x + 0] = d_SrcVal[0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
for (uint size = 2; size < arrayLength; size <<= 1) {
// Bitonic merge
uint dir = (threadIdx.x & (size / 2)) != 0;
for (uint size = 2; size < arrayLength; size <<= 1) {
// Bitonic merge
uint dir = (threadIdx.x & (size / 2)) != 0;
for (uint stride = size / 2; stride > 0; stride >>= 1) {
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
s_val[pos + stride], dir);
for (uint stride = size / 2; stride > 0; stride >>= 1) {
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir);
}
}
}
// ddd == sortDir for the last bitonic merge step
{
for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride],
s_val[pos + stride], sortDir);
// ddd == sortDir for the last bitonic merge step
{
for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], sortDir);
}
}
}
cg::sync(cta);
d_DstKey[0] = s_key[threadIdx.x + 0];
d_DstVal[0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] =
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
cg::sync(cta);
d_DstKey[0] = s_key[threadIdx.x + 0];
d_DstVal[0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
// Helper function (also used by odd-even merge sort)
extern "C" uint factorRadix2(uint *log2L, uint L) {
if (!L) {
*log2L = 0;
return 0;
} else {
for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
;
extern "C" uint factorRadix2(uint *log2L, uint L)
{
if (!L) {
*log2L = 0;
return 0;
}
else {
for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
;
return L;
}
return L;
}
}
extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
uint *d_SrcKey, uint *d_SrcVal,
uint batchSize, uint arrayLength,
uint sortDir) {
// Nothing to sort
if (arrayLength < 2) {
return;
}
extern "C" void bitonicSortShared(uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint batchSize,
uint arrayLength,
uint sortDir)
{
// Nothing to sort
if (arrayLength < 2) {
return;
}
// Only power-of-two array lengths are supported by this implementation
uint log2L;
uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
assert(factorizationRemainder == 1);
// Only power-of-two array lengths are supported by this implementation
uint log2L;
uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
assert(factorizationRemainder == 1);
uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
uint threadCount = SHARED_SIZE_LIMIT / 2;
uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
uint threadCount = SHARED_SIZE_LIMIT / 2;
assert(arrayLength <= SHARED_SIZE_LIMIT);
assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);
assert(arrayLength <= SHARED_SIZE_LIMIT);
assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);
bitonicSortSharedKernel<<<blockCount, threadCount>>>(
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
bitonicSortSharedKernel<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) {
return ((a % b) == 0) ? (a / b) : (a / b + 1);
}
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }
static inline __host__ __device__ uint getSampleCount(uint dividend) {
return iDivUp(dividend, SAMPLE_STRIDE);
static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
template <uint sortDir>
static inline __device__ void
ComparatorExtended(uint &keyA, uint &valA, uint &flagA, uint &keyB, uint &valB, uint &flagB, uint arrowDir)
{
uint t;
if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || ((arrowDir == sortDir) && (flagA == 1))
|| ((arrowDir != sortDir) && (flagB == 1))) {
t = keyA;
keyA = keyB;
keyB = t;
t = valA;
valA = valB;
valB = t;
t = flagA;
flagA = flagB;
flagB = t;
}
}
template <uint sortDir>
static inline __device__ void ComparatorExtended(uint &keyA, uint &valA,
uint &flagA, uint &keyB,
uint &valB, uint &flagB,
uint arrowDir) {
uint t;
__global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB,
uint stride,
uint N)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ uint s_key[2 * SAMPLE_STRIDE];
__shared__ uint s_val[2 * SAMPLE_STRIDE];
__shared__ uint s_inf[2 * SAMPLE_STRIDE];
if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) ||
((arrowDir == sortDir) && (flagA == 1)) ||
((arrowDir != sortDir) && (flagB == 1))) {
t = keyA;
keyA = keyB;
keyB = t;
t = valA;
valA = valB;
valB = t;
t = flagA;
flagA = flagB;
flagB = t;
}
}
const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
d_SrcKey += segmentBase;
d_SrcVal += segmentBase;
d_DstKey += segmentBase;
d_DstVal += segmentBase;
template <uint sortDir>
__global__ void bitonicMergeElementaryIntervalsKernel(
uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal,
uint *d_LimitsA, uint *d_LimitsB, uint stride, uint N) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ uint s_key[2 * SAMPLE_STRIDE];
__shared__ uint s_val[2 * SAMPLE_STRIDE];
__shared__ uint s_inf[2 * SAMPLE_STRIDE];
// Set up threadblock-wide parameters
__shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst;
const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
d_SrcKey += segmentBase;
d_SrcVal += segmentBase;
d_DstKey += segmentBase;
d_DstVal += segmentBase;
if (threadIdx.x == 0) {
uint segmentElementsA = stride;
uint segmentElementsB = umin(stride, N - segmentBase - stride);
uint segmentSamplesA = stride / SAMPLE_STRIDE;
uint segmentSamplesB = getSampleCount(segmentElementsB);
uint segmentSamples = segmentSamplesA + segmentSamplesB;
// Set up threadblock-wide parameters
__shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst;
startSrcA = d_LimitsA[blockIdx.x];
startSrcB = d_LimitsB[blockIdx.x];
startDst = startSrcA + startSrcB;
if (threadIdx.x == 0) {
uint segmentElementsA = stride;
uint segmentElementsB = umin(stride, N - segmentBase - stride);
uint segmentSamplesA = stride / SAMPLE_STRIDE;
uint segmentSamplesB = getSampleCount(segmentElementsB);
uint segmentSamples = segmentSamplesA + segmentSamplesB;
uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
lenSrcA = endSrcA - startSrcA;
lenSrcB = endSrcB - startSrcB;
}
startSrcA = d_LimitsA[blockIdx.x];
startSrcB = d_LimitsB[blockIdx.x];
startDst = startSrcA + startSrcB;
s_inf[threadIdx.x + 0] = 1;
s_inf[threadIdx.x + SAMPLE_STRIDE] = 1;
uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1]
: segmentElementsA;
uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
: segmentElementsB;
lenSrcA = endSrcA - startSrcA;
lenSrcB = endSrcB - startSrcB;
}
s_inf[threadIdx.x + 0] = 1;
s_inf[threadIdx.x + SAMPLE_STRIDE] = 1;
// Load input data
cg::sync(cta);
if (threadIdx.x < lenSrcA) {
s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x];
s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x];
s_inf[threadIdx.x] = 0;
}
// Prepare for bitonic merge by inverting the ordering
if (threadIdx.x < lenSrcB) {
s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
d_SrcKey[stride + startSrcB + threadIdx.x];
s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] =
d_SrcVal[stride + startSrcB + threadIdx.x];
s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
}
//"Extended" bitonic merge
for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
// Load input data
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
ComparatorExtended<sortDir>(s_key[pos + 0], s_val[pos + 0], s_inf[pos + 0],
s_key[pos + stride], s_val[pos + stride],
s_inf[pos + stride], sortDir);
}
// Store sorted data
cg::sync(cta);
d_DstKey += startDst;
d_DstVal += startDst;
if (threadIdx.x < lenSrcA) {
s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x];
s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x];
s_inf[threadIdx.x] = 0;
}
if (threadIdx.x < lenSrcA) {
d_DstKey[threadIdx.x] = s_key[threadIdx.x];
d_DstVal[threadIdx.x] = s_val[threadIdx.x];
}
// Prepare for bitonic merge by inverting the ordering
if (threadIdx.x < lenSrcB) {
s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcKey[stride + startSrcB + threadIdx.x];
s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcVal[stride + startSrcB + threadIdx.x];
s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
}
if (threadIdx.x < lenSrcB) {
d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
}
//"Extended" bitonic merge
for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
ComparatorExtended<sortDir>(s_key[pos + 0],
s_val[pos + 0],
s_inf[pos + 0],
s_key[pos + stride],
s_val[pos + stride],
s_inf[pos + stride],
sortDir);
}
// Store sorted data
cg::sync(cta);
d_DstKey += startDst;
d_DstVal += startDst;
if (threadIdx.x < lenSrcA) {
d_DstKey[threadIdx.x] = s_key[threadIdx.x];
d_DstVal[threadIdx.x] = s_val[threadIdx.x];
}
if (threadIdx.x < lenSrcB) {
d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
}
}
extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
uint *d_SrcKey, uint *d_SrcVal,
extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB, uint stride,
uint N, uint sortDir) {
uint lastSegmentElements = N % (2 * stride);
uint *d_LimitsB,
uint stride,
uint N,
uint sortDir)
{
uint lastSegmentElements = N % (2 * stride);
uint mergePairs = (lastSegmentElements > stride)
? getSampleCount(N)
: (N - lastSegmentElements) / SAMPLE_STRIDE;
uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
if (sortDir) {
bitonicMergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
N);
getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
} else {
bitonicMergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
N);
getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
}
if (sortDir) {
bitonicMergeElementaryIntervalsKernel<1U>
<<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
}
else {
bitonicMergeElementaryIntervalsKernel<0U>
<<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
}
}
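For scale, bitonicSortShared above launches batchSize * arrayLength / SHARED_SIZE_LIMIT blocks of SHARED_SIZE_LIMIT / 2 threads each, and the assert on factorizationRemainder == 1 is what restricts arrayLength to powers of two. Assuming the SHARED_SIZE_LIMIT of 1024 defined in mergeSort_common.h (that header is not part of this excerpt), sorting the N = 4 * 1048576 keys used by the test driver in 1024-element chunks means 4096 blocks of 512 threads for this bottom-level stage.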

View File

@ -26,96 +26,94 @@
*/
#include <assert.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
#include "mergeSort_common.h"
////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
StopWatchInterface *hTimer = NULL;
int main(int argc, char **argv)
{
uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
StopWatchInterface *hTimer = NULL;
const uint N = 4 * 1048576;
const uint DIR = 1;
const uint numValues = 65536;
const uint N = 4 * 1048576;
const uint DIR = 1;
const uint numValues = 65536;
printf("%s Starting...\n\n", argv[0]);
printf("%s Starting...\n\n", argv[0]);
int dev = findCudaDevice(argc, (const char **)argv);
int dev = findCudaDevice(argc, (const char **)argv);
if (dev == -1) {
return EXIT_FAILURE;
}
if (dev == -1) {
return EXIT_FAILURE;
}
printf("Allocating and initializing host arrays...\n\n");
sdkCreateTimer(&hTimer);
h_SrcKey = (uint *)malloc(N * sizeof(uint));
h_SrcVal = (uint *)malloc(N * sizeof(uint));
h_DstKey = (uint *)malloc(N * sizeof(uint));
h_DstVal = (uint *)malloc(N * sizeof(uint));
printf("Allocating and initializing host arrays...\n\n");
sdkCreateTimer(&hTimer);
h_SrcKey = (uint *)malloc(N * sizeof(uint));
h_SrcVal = (uint *)malloc(N * sizeof(uint));
h_DstKey = (uint *)malloc(N * sizeof(uint));
h_DstVal = (uint *)malloc(N * sizeof(uint));
srand(2009);
srand(2009);
for (uint i = 0; i < N; i++) {
h_SrcKey[i] = rand() % numValues;
}
for (uint i = 0; i < N; i++) {
h_SrcKey[i] = rand() % numValues;
}
fillValues(h_SrcVal, N);
fillValues(h_SrcVal, N);
printf("Allocating and initializing CUDA arrays...\n\n");
checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
checkCudaErrors(
cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
checkCudaErrors(
cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));
printf("Allocating and initializing CUDA arrays...\n\n");
checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));
printf("Initializing GPU merge sort...\n");
initMergeSort();
printf("Initializing GPU merge sort...\n");
initMergeSort();
printf("Running GPU merge sort...\n");
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&hTimer);
printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));
printf("Running GPU merge sort...\n");
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&hTimer);
printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));
printf("Reading back GPU merge sort results...\n");
checkCudaErrors(
cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
checkCudaErrors(
cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));
printf("Reading back GPU merge sort results...\n");
checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));
printf("Inspecting the results...\n");
uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);
printf("Inspecting the results...\n");
uint keysFlag = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);
uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N);
uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N);
printf("Shutting down...\n");
closeMergeSort();
sdkDeleteTimer(&hTimer);
checkCudaErrors(cudaFree(d_SrcVal));
checkCudaErrors(cudaFree(d_SrcKey));
checkCudaErrors(cudaFree(d_BufVal));
checkCudaErrors(cudaFree(d_BufKey));
checkCudaErrors(cudaFree(d_DstVal));
checkCudaErrors(cudaFree(d_DstKey));
free(h_DstVal);
free(h_DstKey);
free(h_SrcVal);
free(h_SrcKey);
printf("Shutting down...\n");
closeMergeSort();
sdkDeleteTimer(&hTimer);
checkCudaErrors(cudaFree(d_SrcVal));
checkCudaErrors(cudaFree(d_SrcKey));
checkCudaErrors(cudaFree(d_BufVal));
checkCudaErrors(cudaFree(d_BufKey));
checkCudaErrors(cudaFree(d_DstVal));
checkCudaErrors(cudaFree(d_DstKey));
free(h_DstVal);
free(h_DstKey);
free(h_SrcVal);
free(h_SrcKey);
exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
}
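// The driver sorts N = 4 * 1048576 key/value pairs with keys in [0, 65536)
// in ascending order (DIR = 1), then validates key ordering and value
// stability on the host.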

View File

@ -39,491 +39,499 @@
namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include "mergeSort_common.h"
////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) {
return ((a % b) == 0) ? (a / b) : (a / b + 1);
}
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }
static inline __host__ __device__ uint getSampleCount(uint dividend) {
return iDivUp(dividend, SAMPLE_STRIDE);
}
static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }
#define W (sizeof(uint) * 8)
static inline __device__ uint nextPowerOfTwo(uint x) {
/*
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
*/
return 1U << (W - __clz(x - 1));
static inline __device__ uint nextPowerOfTwo(uint x)
{
/*
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
*/
return 1U << (W - __clz(x - 1));
}
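// Worked example of the __clz() form above: for x = 33, x - 1 = 32 and
// __clz(32) == 26, so the result is 1U << (32 - 26) == 64, the smallest power
// of two >= 33; x = 64 gives the same shift and returns 64 itself, and x = 1
// yields 1U << (32 - __clz(0)) == 1U << 0 == 1.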
template <uint sortDir>
static inline __device__ uint binarySearchInclusive(uint val, uint *data,
uint L, uint stride) {
if (L == 0) {
return 0;
}
uint pos = 0;
for (; stride > 0; stride >>= 1) {
uint newPos = umin(pos + stride, L);
if ((sortDir && (data[newPos - 1] <= val)) ||
(!sortDir && (data[newPos - 1] >= val))) {
pos = newPos;
template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint val, uint *data, uint L, uint stride)
{
if (L == 0) {
return 0;
}
}
return pos;
uint pos = 0;
for (; stride > 0; stride >>= 1) {
uint newPos = umin(pos + stride, L);
if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
pos = newPos;
}
}
return pos;
}
template <uint sortDir>
static inline __device__ uint binarySearchExclusive(uint val, uint *data,
uint L, uint stride) {
if (L == 0) {
return 0;
}
uint pos = 0;
for (; stride > 0; stride >>= 1) {
uint newPos = umin(pos + stride, L);
if ((sortDir && (data[newPos - 1] < val)) ||
(!sortDir && (data[newPos - 1] > val))) {
pos = newPos;
template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint val, uint *data, uint L, uint stride)
{
if (L == 0) {
return 0;
}
}
return pos;
uint pos = 0;
for (; stride > 0; stride >>= 1) {
uint newPos = umin(pos + stride, L);
if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
pos = newPos;
}
}
return pos;
}
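// The two search flavours differ only in how they treat equal keys: the
// exclusive search counts elements strictly smaller (strictly greater for a
// descending sort), while the inclusive search also counts equal ones. Ranking
// A elements with the exclusive search and B elements with the inclusive one
// keeps equal keys in A-before-B order, which is what makes the merge stable.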
////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal,
uint *d_SrcKey, uint *d_SrcVal,
uint arrayLength) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT];
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT];
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[0];
s_val[threadIdx.x + 0] = d_SrcVal[0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[0];
s_val[threadIdx.x + 0] = d_SrcVal[0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
for (uint stride = 1; stride < arrayLength; stride <<= 1) {
uint lPos = threadIdx.x & (stride - 1);
uint *baseKey = s_key + 2 * (threadIdx.x - lPos);
uint *baseVal = s_val + 2 * (threadIdx.x - lPos);
for (uint stride = 1; stride < arrayLength; stride <<= 1) {
uint lPos = threadIdx.x & (stride - 1);
uint *baseKey = s_key + 2 * (threadIdx.x - lPos);
uint *baseVal = s_val + 2 * (threadIdx.x - lPos);
cg::sync(cta);
uint keyA = baseKey[lPos + 0];
uint valA = baseVal[lPos + 0];
uint keyB = baseKey[lPos + stride];
uint valB = baseVal[lPos + stride];
uint posA = binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + lPos;
uint posB = binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + lPos;
cg::sync(cta);
baseKey[posA] = keyA;
baseVal[posA] = valA;
baseKey[posB] = keyB;
baseVal[posB] = valB;
}
cg::sync(cta);
uint keyA = baseKey[lPos + 0];
uint valA = baseVal[lPos + 0];
uint keyB = baseKey[lPos + stride];
uint valB = baseVal[lPos + stride];
uint posA =
binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) +
lPos;
uint posB =
binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) +
lPos;
cg::sync(cta);
baseKey[posA] = keyA;
baseVal[posA] = valA;
baseKey[posB] = keyB;
baseVal[posB] = valB;
}
cg::sync(cta);
d_DstKey[0] = s_key[threadIdx.x + 0];
d_DstVal[0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] =
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstKey[0] = s_key[threadIdx.x + 0];
d_DstVal[0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}
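// Each block sorts one tile of SHARED_SIZE_LIMIT (1024) key/value pairs in
// shared memory using SHARED_SIZE_LIMIT / 2 threads, every thread loading and
// storing one element from each half of the tile.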
static void mergeSortShared(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey,
uint *d_SrcVal, uint batchSize, uint arrayLength,
uint sortDir) {
if (arrayLength < 2) {
return;
}
static void mergeSortShared(uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint batchSize,
uint arrayLength,
uint sortDir)
{
if (arrayLength < 2) {
return;
}
assert(SHARED_SIZE_LIMIT % arrayLength == 0);
assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0);
uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
uint threadCount = SHARED_SIZE_LIMIT / 2;
assert(SHARED_SIZE_LIMIT % arrayLength == 0);
assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0);
uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
uint threadCount = SHARED_SIZE_LIMIT / 2;
if (sortDir) {
mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
getLastCudaError("mergeSortShared<1><<<>>> failed\n");
} else {
mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
getLastCudaError("mergeSortShared<0><<<>>> failed\n");
}
if (sortDir) {
mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
getLastCudaError("mergeSortShared<1><<<>>> failed\n");
}
else {
mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
getLastCudaError("mergeSortShared<0><<<>>> failed\n");
}
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB,
uint *d_SrcKey, uint stride, uint N,
uint threadCount) {
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
__global__ void
generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint threadCount)
{
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
if (pos >= threadCount) {
return;
}
if (pos >= threadCount) {
return;
}
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
d_SrcKey += segmentBase;
d_RanksA += segmentBase / SAMPLE_STRIDE;
d_RanksB += segmentBase / SAMPLE_STRIDE;
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
d_SrcKey += segmentBase;
d_RanksA += segmentBase / SAMPLE_STRIDE;
d_RanksB += segmentBase / SAMPLE_STRIDE;
const uint segmentElementsA = stride;
const uint segmentElementsB = umin(stride, N - segmentBase - stride);
const uint segmentSamplesA = getSampleCount(segmentElementsA);
const uint segmentSamplesB = getSampleCount(segmentElementsB);
const uint segmentElementsA = stride;
const uint segmentElementsB = umin(stride, N - segmentBase - stride);
const uint segmentSamplesA = getSampleCount(segmentElementsA);
const uint segmentSamplesB = getSampleCount(segmentElementsB);
if (i < segmentSamplesA) {
d_RanksA[i] = i * SAMPLE_STRIDE;
d_RanksB[i] = binarySearchExclusive<sortDir>(
d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB,
nextPowerOfTwo(segmentElementsB));
}
if (i < segmentSamplesA) {
d_RanksA[i] = i * SAMPLE_STRIDE;
d_RanksB[i] = binarySearchExclusive<sortDir>(
d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, nextPowerOfTwo(segmentElementsB));
}
if (i < segmentSamplesB) {
d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA,
nextPowerOfTwo(segmentElementsA));
}
if (i < segmentSamplesB) {
d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, nextPowerOfTwo(segmentElementsA));
}
}
static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey,
uint stride, uint N, uint sortDir) {
uint lastSegmentElements = N % (2 * stride);
uint threadCount =
(lastSegmentElements > stride)
? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint sortDir)
{
uint lastSegmentElements = N % (2 * stride);
uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
if (sortDir) {
generateSampleRanksKernel<1U><<<iDivUp(threadCount, 256), 256>>>(
d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
} else {
generateSampleRanksKernel<0U><<<iDivUp(threadCount, 256), 256>>>(
d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
}
if (sortDir) {
generateSampleRanksKernel<1U>
<<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
}
else {
generateSampleRanksKernel<0U>
<<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
}
}
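// threadCount equals the number of SAMPLE_STRIDE-wide samples in the A halves
// of all segments merged at this stride; each thread then writes the ranks for
// one sample position in both halves, bounds permitting. The trailing partial
// segment is included only when it is longer than stride and really merges.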
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: merge sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks,
uint stride, uint N,
uint threadCount) {
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, uint stride, uint N, uint threadCount)
{
uint pos = blockIdx.x * blockDim.x + threadIdx.x;
if (pos >= threadCount) {
return;
}
if (pos >= threadCount) {
return;
}
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
d_Ranks += (pos - i) * 2;
d_Limits += (pos - i) * 2;
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
d_Ranks += (pos - i) * 2;
d_Limits += (pos - i) * 2;
const uint segmentElementsA = stride;
const uint segmentElementsB = umin(stride, N - segmentBase - stride);
const uint segmentSamplesA = getSampleCount(segmentElementsA);
const uint segmentSamplesB = getSampleCount(segmentElementsB);
const uint segmentElementsA = stride;
const uint segmentElementsB = umin(stride, N - segmentBase - stride);
const uint segmentSamplesA = getSampleCount(segmentElementsA);
const uint segmentSamplesB = getSampleCount(segmentElementsB);
if (i < segmentSamplesA) {
uint dstPos = binarySearchExclusive<1U>(
d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB,
nextPowerOfTwo(segmentSamplesB)) +
i;
d_Limits[dstPos] = d_Ranks[i];
}
if (i < segmentSamplesA) {
uint dstPos = binarySearchExclusive<1U>(
d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB))
+ i;
d_Limits[dstPos] = d_Ranks[i];
}
if (i < segmentSamplesB) {
uint dstPos = binarySearchInclusive<1U>(d_Ranks[segmentSamplesA + i],
d_Ranks, segmentSamplesA,
nextPowerOfTwo(segmentSamplesA)) +
i;
d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
}
if (i < segmentSamplesB) {
uint dstPos = binarySearchInclusive<1U>(
d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA))
+ i;
d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
}
}
static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB,
uint *d_RanksA, uint *d_RanksB, uint stride,
uint N) {
uint lastSegmentElements = N % (2 * stride);
uint threadCount =
(lastSegmentElements > stride)
? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, uint *d_RanksA, uint *d_RanksB, uint stride, uint N)
{
uint lastSegmentElements = N % (2 * stride);
uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(
d_LimitsA, d_RanksA, stride, N, threadCount);
getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");
mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsA, d_RanksA, stride, N, threadCount);
getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");
mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(
d_LimitsB, d_RanksB, stride, N, threadCount);
getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsB, d_RanksB, stride, N, threadCount);
getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
}
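// After this step d_LimitsA/d_LimitsB hold, per elementary interval, the start
// offsets into the A and B halves of the segment; consecutive limits bound at
// most SAMPLE_STRIDE elements from each half, which is what lets step 3 merge
// every interval inside a single block's shared memory.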
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
inline __device__ void merge(uint *dstKey, uint *dstVal, uint *srcAKey,
uint *srcAVal, uint *srcBKey, uint *srcBVal,
uint lenA, uint nPowTwoLenA, uint lenB,
uint nPowTwoLenB, cg::thread_block cta) {
uint keyA, valA, keyB, valB, dstPosA, dstPosB;
inline __device__ void merge(uint *dstKey,
uint *dstVal,
uint *srcAKey,
uint *srcAVal,
uint *srcBKey,
uint *srcBVal,
uint lenA,
uint nPowTwoLenA,
uint lenB,
uint nPowTwoLenB,
cg::thread_block cta)
{
uint keyA, valA, keyB, valB, dstPosA, dstPosB;
if (threadIdx.x < lenA) {
keyA = srcAKey[threadIdx.x];
valA = srcAVal[threadIdx.x];
dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) +
threadIdx.x;
}
if (threadIdx.x < lenA) {
keyA = srcAKey[threadIdx.x];
valA = srcAVal[threadIdx.x];
dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + threadIdx.x;
}
if (threadIdx.x < lenB) {
keyB = srcBKey[threadIdx.x];
valB = srcBVal[threadIdx.x];
dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) +
threadIdx.x;
}
if (threadIdx.x < lenB) {
keyB = srcBKey[threadIdx.x];
valB = srcBVal[threadIdx.x];
dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + threadIdx.x;
}
cg::sync(cta);
cg::sync(cta);
if (threadIdx.x < lenA) {
dstKey[dstPosA] = keyA;
dstVal[dstPosA] = valA;
}
if (threadIdx.x < lenA) {
dstKey[dstPosA] = keyA;
dstVal[dstPosA] = valA;
}
if (threadIdx.x < lenB) {
dstKey[dstPosB] = keyB;
dstVal[dstPosB] = valB;
}
if (threadIdx.x < lenB) {
dstKey[dstPosB] = keyB;
dstVal[dstPosB] = valB;
}
}
template <uint sortDir>
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey, uint *d_DstVal,
uint *d_SrcKey, uint *d_SrcVal,
uint *d_LimitsA, uint *d_LimitsB,
uint stride, uint N) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ uint s_key[2 * SAMPLE_STRIDE];
__shared__ uint s_val[2 * SAMPLE_STRIDE];
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB,
uint stride,
uint N)
{
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
__shared__ uint s_key[2 * SAMPLE_STRIDE];
__shared__ uint s_val[2 * SAMPLE_STRIDE];
const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
d_SrcKey += segmentBase;
d_SrcVal += segmentBase;
d_DstKey += segmentBase;
d_DstVal += segmentBase;
const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
d_SrcKey += segmentBase;
d_SrcVal += segmentBase;
d_DstKey += segmentBase;
d_DstVal += segmentBase;
// Set up threadblock-wide parameters
__shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;
// Set up threadblock-wide parameters
__shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;
if (threadIdx.x == 0) {
uint segmentElementsA = stride;
uint segmentElementsB = umin(stride, N - segmentBase - stride);
uint segmentSamplesA = getSampleCount(segmentElementsA);
uint segmentSamplesB = getSampleCount(segmentElementsB);
uint segmentSamples = segmentSamplesA + segmentSamplesB;
if (threadIdx.x == 0) {
uint segmentElementsA = stride;
uint segmentElementsB = umin(stride, N - segmentBase - stride);
uint segmentSamplesA = getSampleCount(segmentElementsA);
uint segmentSamplesB = getSampleCount(segmentElementsB);
uint segmentSamples = segmentSamplesA + segmentSamplesB;
startSrcA = d_LimitsA[blockIdx.x];
startSrcB = d_LimitsB[blockIdx.x];
uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1]
: segmentElementsA;
uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1]
: segmentElementsB;
lenSrcA = endSrcA - startSrcA;
lenSrcB = endSrcB - startSrcB;
startDstA = startSrcA + startSrcB;
startDstB = startDstA + lenSrcA;
}
// Load main input data
cg::sync(cta);
if (threadIdx.x < lenSrcA) {
s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x];
s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x];
}
if (threadIdx.x < lenSrcB) {
s_key[threadIdx.x + SAMPLE_STRIDE] =
d_SrcKey[stride + startSrcB + threadIdx.x];
s_val[threadIdx.x + SAMPLE_STRIDE] =
d_SrcVal[stride + startSrcB + threadIdx.x];
}
// Merge data in shared memory
cg::sync(cta);
merge<sortDir>(s_key, s_val, s_key + 0, s_val + 0, s_key + SAMPLE_STRIDE,
s_val + SAMPLE_STRIDE, lenSrcA, SAMPLE_STRIDE, lenSrcB,
SAMPLE_STRIDE, cta);
// Store merged data
cg::sync(cta);
if (threadIdx.x < lenSrcA) {
d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x];
d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x];
}
if (threadIdx.x < lenSrcB) {
d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
}
}
static void mergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
uint *d_SrcKey, uint *d_SrcVal,
uint *d_LimitsA, uint *d_LimitsB,
uint stride, uint N, uint sortDir) {
uint lastSegmentElements = N % (2 * stride);
uint mergePairs = (lastSegmentElements > stride)
? getSampleCount(N)
: (N - lastSegmentElements) / SAMPLE_STRIDE;
if (sortDir) {
mergeElementaryIntervalsKernel<1U><<<mergePairs, SAMPLE_STRIDE>>>(
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
N);
getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
} else {
mergeElementaryIntervalsKernel<0U><<<mergePairs, SAMPLE_STRIDE>>>(
d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride,
N);
getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
}
}
extern "C" void bitonicSortShared(uint *d_DstKey, uint *d_DstVal,
uint *d_SrcKey, uint *d_SrcVal,
uint batchSize, uint arrayLength,
uint sortDir);
extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey, uint *d_DstVal,
uint *d_SrcKey, uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB, uint stride,
uint N, uint sortDir);
static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
static const uint MAX_SAMPLE_COUNT = 32768;
extern "C" void initMergeSort(void) {
checkCudaErrors(
cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
checkCudaErrors(
cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
checkCudaErrors(
cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
checkCudaErrors(
cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
}
extern "C" void closeMergeSort(void) {
checkCudaErrors(cudaFree(d_RanksA));
checkCudaErrors(cudaFree(d_RanksB));
checkCudaErrors(cudaFree(d_LimitsB));
checkCudaErrors(cudaFree(d_LimitsA));
}
extern "C" void mergeSort(uint *d_DstKey, uint *d_DstVal, uint *d_BufKey,
uint *d_BufVal, uint *d_SrcKey, uint *d_SrcVal,
uint N, uint sortDir) {
uint stageCount = 0;
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
;
uint *ikey, *ival, *okey, *oval;
if (stageCount & 1) {
ikey = d_BufKey;
ival = d_BufVal;
okey = d_DstKey;
oval = d_DstVal;
} else {
ikey = d_DstKey;
ival = d_DstVal;
okey = d_BufKey;
oval = d_BufVal;
}
assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
assert(N % SHARED_SIZE_LIMIT == 0);
mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT,
SHARED_SIZE_LIMIT, sortDir);
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
uint lastSegmentElements = N % (2 * stride);
// Find sample ranks and prepare for limiters merge
generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir);
// Merge ranks and indices
mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);
// Merge elementary intervals
mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB,
stride, N, sortDir);
if (lastSegmentElements <= stride) {
// Last merge segment consists of a single array which just needs to be
// passed through
checkCudaErrors(cudaMemcpy(
okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemcpy(
oval + (N - lastSegmentElements), ival + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint), cudaMemcpyDeviceToDevice));
startSrcA = d_LimitsA[blockIdx.x];
startSrcB = d_LimitsB[blockIdx.x];
uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
lenSrcA = endSrcA - startSrcA;
lenSrcB = endSrcB - startSrcB;
startDstA = startSrcA + startSrcB;
startDstB = startDstA + lenSrcA;
}
uint *t;
t = ikey;
ikey = okey;
okey = t;
t = ival;
ival = oval;
oval = t;
}
// Load main input data
cg::sync(cta);
if (threadIdx.x < lenSrcA) {
s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x];
s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x];
}
if (threadIdx.x < lenSrcB) {
s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x];
s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x];
}
// Merge data in shared memory
cg::sync(cta);
merge<sortDir>(s_key,
s_val,
s_key + 0,
s_val + 0,
s_key + SAMPLE_STRIDE,
s_val + SAMPLE_STRIDE,
lenSrcA,
SAMPLE_STRIDE,
lenSrcB,
SAMPLE_STRIDE,
cta);
// Store merged data
cg::sync(cta);
if (threadIdx.x < lenSrcA) {
d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x];
d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x];
}
if (threadIdx.x < lenSrcB) {
d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
}
}
static void mergeElementaryIntervals(uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB,
uint stride,
uint N,
uint sortDir)
{
uint lastSegmentElements = N % (2 * stride);
uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
if (sortDir) {
mergeElementaryIntervalsKernel<1U>
<<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
}
else {
mergeElementaryIntervalsKernel<0U>
<<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
}
}
extern "C" void bitonicSortShared(uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint batchSize,
uint arrayLength,
uint sortDir);
extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
uint *d_DstVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint *d_LimitsA,
uint *d_LimitsB,
uint stride,
uint N,
uint sortDir);
static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
static const uint MAX_SAMPLE_COUNT = 32768;
extern "C" void initMergeSort(void)
{
checkCudaErrors(cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
checkCudaErrors(cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
}
extern "C" void closeMergeSort(void)
{
checkCudaErrors(cudaFree(d_RanksA));
checkCudaErrors(cudaFree(d_RanksB));
checkCudaErrors(cudaFree(d_LimitsB));
checkCudaErrors(cudaFree(d_LimitsA));
}
extern "C" void mergeSort(uint *d_DstKey,
uint *d_DstVal,
uint *d_BufKey,
uint *d_BufVal,
uint *d_SrcKey,
uint *d_SrcVal,
uint N,
uint sortDir)
{
uint stageCount = 0;
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
;
uint *ikey, *ival, *okey, *oval;
if (stageCount & 1) {
ikey = d_BufKey;
ival = d_BufVal;
okey = d_DstKey;
oval = d_DstVal;
}
else {
ikey = d_DstKey;
ival = d_DstVal;
okey = d_BufKey;
oval = d_BufVal;
}
assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
assert(N % SHARED_SIZE_LIMIT == 0);
mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, SHARED_SIZE_LIMIT, sortDir);
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
uint lastSegmentElements = N % (2 * stride);
// Find sample ranks and prepare for limiters merge
generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir);
// Merge ranks and indices
mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);
// Merge elementary intervals
mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, stride, N, sortDir);
if (lastSegmentElements <= stride) {
// Last merge segment consists of a single array which just needs to be
// passed through
checkCudaErrors(cudaMemcpy(okey + (N - lastSegmentElements),
ikey + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint),
cudaMemcpyDeviceToDevice));
checkCudaErrors(cudaMemcpy(oval + (N - lastSegmentElements),
ival + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint),
cudaMemcpyDeviceToDevice));
}
uint *t;
t = ikey;
ikey = okey;
okey = t;
t = ival;
ival = oval;
oval = t;
}
}
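// The initial buffer assignment depends on the parity of stageCount so that,
// after the final ping-pong swap in the loop above, the fully sorted result
// always ends up in d_DstKey/d_DstVal rather than in the temporary buffers.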

View File

@ -31,19 +31,17 @@
typedef unsigned int uint;
#define SHARED_SIZE_LIMIT 1024U
#define SAMPLE_STRIDE 128
#define SAMPLE_STRIDE 128
////////////////////////////////////////////////////////////////////////////////
// Extensive sort validation routine
////////////////////////////////////////////////////////////////////////////////
extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
uint arrayLength, uint numValues,
uint sortDir);
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir);
extern "C" void fillValues(uint *val, uint N);
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey,
uint batchSize, uint arrayLength);
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength);
////////////////////////////////////////////////////////////////////////////////
// CUDA merge sort
@ -52,13 +50,11 @@ extern "C" void initMergeSort(void);
extern "C" void closeMergeSort(void);
extern "C" void mergeSort(uint *dstKey, uint *dstVal, uint *bufKey,
uint *bufVal, uint *srcKey, uint *srcVal, uint N,
uint sortDir);
extern "C" void
mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);
////////////////////////////////////////////////////////////////////////////////
// CPU "emulation"
////////////////////////////////////////////////////////////////////////////////
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
uint *bufVal, uint *srcKey, uint *srcVal, uint N,
uint sortDir);
extern "C" void
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

View File

@ -29,329 +29,335 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "mergeSort_common.h"
////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static void checkOrder(uint *data, uint N, uint sortDir) {
if (N <= 1) {
return;
}
for (uint i = 0; i < N - 1; i++)
if ((sortDir && (data[i] > data[i + 1])) ||
(!sortDir && (data[i] < data[i + 1]))) {
fprintf(stderr, "checkOrder() failed!!!\n");
exit(EXIT_FAILURE);
static void checkOrder(uint *data, uint N, uint sortDir)
{
if (N <= 1) {
return;
}
for (uint i = 0; i < N - 1; i++)
if ((sortDir && (data[i] > data[i + 1])) || (!sortDir && (data[i] < data[i + 1]))) {
fprintf(stderr, "checkOrder() failed!!!\n");
exit(EXIT_FAILURE);
}
}
static uint umin(uint a, uint b) { return (a <= b) ? a : b; }
static uint getSampleCount(uint dividend) {
return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1)
: (dividend / SAMPLE_STRIDE);
static uint getSampleCount(uint dividend)
{
return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1) : (dividend / SAMPLE_STRIDE);
}
static uint nextPowerOfTwo(uint x) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
static uint nextPowerOfTwo(uint x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir) {
if (L == 0) {
return 0;
}
uint pos = 0;
for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
uint newPos = umin(pos + stride, L);
if ((sortDir && (data[newPos - 1] <= val)) ||
(!sortDir && (data[newPos - 1] >= val))) {
pos = newPos;
static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir)
{
if (L == 0) {
return 0;
}
}
return pos;
uint pos = 0;
for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
uint newPos = umin(pos + stride, L);
if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
pos = newPos;
}
}
return pos;
}
static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir) {
if (L == 0) {
return 0;
}
uint pos = 0;
for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
uint newPos = umin(pos + stride, L);
if ((sortDir && (data[newPos - 1] < val)) ||
(!sortDir && (data[newPos - 1] > val))) {
pos = newPos;
static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir)
{
if (L == 0) {
return 0;
}
}
return pos;
uint pos = 0;
for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
uint newPos = umin(pos + stride, L);
if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
pos = newPos;
}
}
return pos;
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: find sample ranks in each segment
////////////////////////////////////////////////////////////////////////////////
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey,
uint stride, uint N, uint sortDir) {
uint lastSegmentElements = N % (2 * stride);
uint sampleCount =
(lastSegmentElements > stride)
? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint stride, uint N, uint sortDir)
{
uint lastSegmentElements = N % (2 * stride);
uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
for (uint pos = 0; pos < sampleCount; pos++) {
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
for (uint pos = 0; pos < sampleCount; pos++) {
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
const uint lenA = stride;
const uint lenB = umin(stride, N - segmentBase - stride);
const uint nA = stride / SAMPLE_STRIDE;
const uint nB = getSampleCount(lenB);
const uint lenA = stride;
const uint lenB = umin(stride, N - segmentBase - stride);
const uint nA = stride / SAMPLE_STRIDE;
const uint nB = getSampleCount(lenB);
if (i < nA) {
ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] =
binarySearchExclusive(srcKey[segmentBase + i * SAMPLE_STRIDE],
srcKey + segmentBase + stride, lenB, sortDir);
if (i < nA) {
ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = binarySearchExclusive(
srcKey[segmentBase + i * SAMPLE_STRIDE], srcKey + segmentBase + stride, lenB, sortDir);
}
if (i < nB) {
ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = binarySearchInclusive(
srcKey[segmentBase + stride + i * SAMPLE_STRIDE], srcKey + segmentBase, lenA, sortDir);
}
}
if (i < nB) {
ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] =
binarySearchInclusive(
srcKey[segmentBase + stride + i * SAMPLE_STRIDE],
srcKey + segmentBase, lenA, sortDir);
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: merge ranks and indices to derive elementary intervals
////////////////////////////////////////////////////////////////////////////////
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride,
uint N) {
uint lastSegmentElements = N % (2 * stride);
uint sampleCount =
(lastSegmentElements > stride)
? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N)
{
uint lastSegmentElements = N % (2 * stride);
uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
: (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);
for (uint pos = 0; pos < sampleCount; pos++) {
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
for (uint pos = 0; pos < sampleCount; pos++) {
const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
const uint lenA = stride;
const uint lenB = umin(stride, N - segmentBase - stride);
const uint nA = stride / SAMPLE_STRIDE;
const uint nB = getSampleCount(lenB);
const uint lenA = stride;
const uint lenB = umin(stride, N - segmentBase - stride);
const uint nA = stride / SAMPLE_STRIDE;
const uint nB = getSampleCount(lenB);
if (i < nA) {
uint dstPosA =
binarySearchExclusive(ranks[(segmentBase + 0) / SAMPLE_STRIDE + i],
ranks + (segmentBase + stride) / SAMPLE_STRIDE,
nB, 1) +
i;
assert(dstPosA < nA + nB);
limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] =
ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
if (i < nA) {
uint dstPosA =
binarySearchExclusive(
ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], ranks + (segmentBase + stride) / SAMPLE_STRIDE, nB, 1)
+ i;
assert(dstPosA < nA + nB);
limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
}
if (i < nB) {
uint dstPosA =
binarySearchInclusive(
ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1)
+ i;
assert(dstPosA < nA + nB);
limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
}
}
if (i < nB) {
uint dstPosA = binarySearchInclusive(
ranks[(segmentBase + stride) / SAMPLE_STRIDE + i],
ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1) +
i;
assert(dstPosA < nA + nB);
limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] =
ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals (each interval is <= SAMPLE_STRIDE)
////////////////////////////////////////////////////////////////////////////////
static void merge(uint *dstKey, uint *dstVal, uint *srcAKey, uint *srcAVal,
uint *srcBKey, uint *srcBVal, uint lenA, uint lenB,
uint sortDir) {
checkOrder(srcAKey, lenA, sortDir);
checkOrder(srcBKey, lenB, sortDir);
static void merge(uint *dstKey,
uint *dstVal,
uint *srcAKey,
uint *srcAVal,
uint *srcBKey,
uint *srcBVal,
uint lenA,
uint lenB,
uint sortDir)
{
checkOrder(srcAKey, lenA, sortDir);
checkOrder(srcBKey, lenB, sortDir);
for (uint i = 0; i < lenA; i++) {
uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i;
assert(dstPos < lenA + lenB);
dstKey[dstPos] = srcAKey[i];
dstVal[dstPos] = srcAVal[i];
}
for (uint i = 0; i < lenA; i++) {
uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i;
assert(dstPos < lenA + lenB);
dstKey[dstPos] = srcAKey[i];
dstVal[dstPos] = srcAVal[i];
}
for (uint i = 0; i < lenB; i++) {
uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i;
assert(dstPos < lenA + lenB);
dstKey[dstPos] = srcBKey[i];
dstVal[dstPos] = srcBVal[i];
}
for (uint i = 0; i < lenB; i++) {
uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i;
assert(dstPos < lenA + lenB);
dstKey[dstPos] = srcBKey[i];
dstVal[dstPos] = srcBVal[i];
}
}
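// Rank-based merging in a nutshell: an element's final position is its index
// in its own array plus the number of elements of the other array that must
// precede it, which the two binary searches compute directly.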
static void mergeElementaryIntervals(uint *dstKey, uint *dstVal, uint *srcKey,
uint *srcVal, uint *limitsA, uint *limitsB,
uint stride, uint N, uint sortDir) {
uint lastSegmentElements = N % (2 * stride);
uint mergePairs = (lastSegmentElements > stride)
? getSampleCount(N)
: (N - lastSegmentElements) / SAMPLE_STRIDE;
static void mergeElementaryIntervals(uint *dstKey,
uint *dstVal,
uint *srcKey,
uint *srcVal,
uint *limitsA,
uint *limitsB,
uint stride,
uint N,
uint sortDir)
{
uint lastSegmentElements = N % (2 * stride);
uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
for (uint pos = 0; pos < mergePairs; pos++) {
uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
uint segmentBase = (pos - i) * SAMPLE_STRIDE;
for (uint pos = 0; pos < mergePairs; pos++) {
uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
uint segmentBase = (pos - i) * SAMPLE_STRIDE;
const uint lenA = stride;
const uint lenB = umin(stride, N - segmentBase - stride);
const uint nA = stride / SAMPLE_STRIDE;
const uint nB = getSampleCount(lenB);
const uint n = nA + nB;
const uint lenA = stride;
const uint lenB = umin(stride, N - segmentBase - stride);
const uint nA = stride / SAMPLE_STRIDE;
const uint nB = getSampleCount(lenB);
const uint n = nA + nB;
const uint startPosA = limitsA[pos];
const uint endPosA = (i + 1 < n) ? limitsA[pos + 1] : lenA;
const uint startPosB = limitsB[pos];
const uint endPosB = (i + 1 < n) ? limitsB[pos + 1] : lenB;
const uint startPosDst = startPosA + startPosB;
const uint startPosA = limitsA[pos];
const uint endPosA = (i + 1 < n) ? limitsA[pos + 1] : lenA;
const uint startPosB = limitsB[pos];
const uint endPosB = (i + 1 < n) ? limitsB[pos + 1] : lenB;
const uint startPosDst = startPosA + startPosB;
assert(startPosA <= endPosA && endPosA <= lenA);
assert(startPosB <= endPosB && endPosB <= lenB);
assert((endPosA - startPosA) <= SAMPLE_STRIDE);
assert((endPosB - startPosB) <= SAMPLE_STRIDE);
assert(startPosA <= endPosA && endPosA <= lenA);
assert(startPosB <= endPosB && endPosB <= lenB);
assert((endPosA - startPosA) <= SAMPLE_STRIDE);
assert((endPosB - startPosB) <= SAMPLE_STRIDE);
merge(dstKey + segmentBase + startPosDst,
dstVal + segmentBase + startPosDst,
(srcKey + segmentBase + 0) + startPosA,
(srcVal + segmentBase + 0) + startPosA,
(srcKey + segmentBase + stride) + startPosB,
(srcVal + segmentBase + stride) + startPosB, endPosA - startPosA,
endPosB - startPosB, sortDir);
}
merge(dstKey + segmentBase + startPosDst,
dstVal + segmentBase + startPosDst,
(srcKey + segmentBase + 0) + startPosA,
(srcVal + segmentBase + 0) + startPosA,
(srcKey + segmentBase + stride) + startPosB,
(srcVal + segmentBase + stride) + startPosB,
endPosA - startPosA,
endPosB - startPosB,
sortDir);
}
}
////////////////////////////////////////////////////////////////////////////////
// Simple selection-style reference sort
////////////////////////////////////////////////////////////////////////////////
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir) {
if (N <= 1) {
return;
}
for (uint bottom = 0; bottom < N - 1; bottom++) {
uint savePos = bottom;
uint saveKey = key[bottom];
for (uint i = bottom + 1; i < N; i++)
if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) {
savePos = i;
saveKey = key[i];
}
if (savePos != bottom) {
uint t;
t = key[savePos];
key[savePos] = key[bottom];
key[bottom] = t;
t = val[savePos];
val[savePos] = val[bottom];
val[bottom] = t;
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir)
{
if (N <= 1) {
return;
}
for (uint bottom = 0; bottom < N - 1; bottom++) {
uint savePos = bottom;
uint saveKey = key[bottom];
for (uint i = bottom + 1; i < N; i++)
if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) {
savePos = i;
saveKey = key[i];
}
if (savePos != bottom) {
uint t;
t = key[savePos];
key[savePos] = key[bottom];
key[bottom] = t;
t = val[savePos];
val[savePos] = val[bottom];
val[bottom] = t;
}
}
}
}
////////////////////////////////////////////////////////////////////////////////
// Interface function
////////////////////////////////////////////////////////////////////////////////
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey,
uint *bufVal, uint *srcKey, uint *srcVal, uint N,
uint sortDir) {
uint *ikey, *ival, *okey, *oval;
uint stageCount = 0;
extern "C" void
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir)
{
uint *ikey, *ival, *okey, *oval;
uint stageCount = 0;
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
;
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
;
if (stageCount & 1) {
ikey = bufKey;
ival = bufVal;
okey = dstKey;
oval = dstVal;
} else {
ikey = dstKey;
ival = dstVal;
okey = bufKey;
oval = bufVal;
}
printf("Bottom-level sort...\n");
memcpy(ikey, srcKey, N * sizeof(uint));
memcpy(ival, srcVal, N * sizeof(uint));
for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos),
sortDir);
}
printf("Merge...\n");
uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
uint lastSegmentElements = N % (2 * stride);
// Find sample ranks and prepare for limiters merge
generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);
// Merge ranks and indices
mergeRanksAndIndices(limitsA, ranksA, stride, N);
mergeRanksAndIndices(limitsB, ranksB, stride, N);
// Merge elementary intervals
mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride,
N, sortDir);
if (lastSegmentElements <= stride) {
// Last merge segment consists of a single array which just needs to be
// passed through
memcpy(okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint));
memcpy(oval + (N - lastSegmentElements), ival + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint));
if (stageCount & 1) {
ikey = bufKey;
ival = bufVal;
okey = dstKey;
oval = dstVal;
}
else {
ikey = dstKey;
ival = dstVal;
okey = bufKey;
oval = bufVal;
}
uint *t;
t = ikey;
ikey = okey;
okey = t;
t = ival;
ival = oval;
oval = t;
}
printf("Bottom-level sort...\n");
memcpy(ikey, srcKey, N * sizeof(uint));
memcpy(ival, srcVal, N * sizeof(uint));
free(limitsB);
free(limitsA);
free(ranksB);
free(ranksA);
for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos), sortDir);
}
printf("Merge...\n");
uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
uint lastSegmentElements = N % (2 * stride);
// Find sample ranks and prepare for limiters merge
generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);
// Merge ranks and indices
mergeRanksAndIndices(limitsA, ranksA, stride, N);
mergeRanksAndIndices(limitsB, ranksB, stride, N);
// Merge elementary intervals
mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, N, sortDir);
if (lastSegmentElements <= stride) {
// Last merge segment consists of a single array which just needs to be
// passed through
memcpy(
okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
memcpy(
oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
}
uint *t;
t = ikey;
ikey = okey;
okey = t;
t = ival;
ival = oval;
oval = t;
}
free(limitsB);
free(limitsA);
free(ranksB);
free(ranksA);
}
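// The host path above follows the same pipeline as the GPU implementation
// (bottom-level chunk sort, sample ranks, rank/limit merge, elementary-interval
// merge) entirely on the CPU, using the same SHARED_SIZE_LIMIT chunking.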

View File

@ -29,104 +29,100 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "mergeSort_common.h"
////////////////////////////////////////////////////////////////////////////////
// Validate sorted keys array (check for integrity and proper order)
////////////////////////////////////////////////////////////////////////////////
extern "C" uint validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize,
uint arrayLength, uint numValues,
uint sortDir) {
uint *srcHist;
uint *resHist;
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir)
{
uint *srcHist;
uint *resHist;
if (arrayLength < 2) {
printf("validateSortedKeys(): arrays too short, exiting...\n");
return 1;
}
printf("...inspecting keys array: ");
srcHist = (uint *)malloc(numValues * sizeof(uint));
resHist = (uint *)malloc(numValues * sizeof(uint));
int flag = 1;
for (uint j = 0; j < batchSize;
j++, srcKey += arrayLength, resKey += arrayLength) {
// Build histograms for keys arrays
memset(srcHist, 0, numValues * sizeof(uint));
memset(resHist, 0, numValues * sizeof(uint));
for (uint i = 0; i < arrayLength; i++) {
if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
srcHist[srcKey[i]]++;
resHist[resKey[i]]++;
} else {
fprintf(
stderr,
"***Set %u source/result key arrays are not limited properly***\n",
j);
flag = 0;
goto brk;
}
if (arrayLength < 2) {
printf("validateSortedKeys(): arrays too short, exiting...\n");
return 1;
}
// Compare the histograms
for (uint i = 0; i < numValues; i++)
if (srcHist[i] != resHist[i]) {
fprintf(stderr,
"***Set %u source/result keys histograms do not match***\n", j);
flag = 0;
goto brk;
}
printf("...inspecting keys array: ");
srcHist = (uint *)malloc(numValues * sizeof(uint));
resHist = (uint *)malloc(numValues * sizeof(uint));
// Finally check the ordering
for (uint i = 0; i < arrayLength - 1; i++)
if ((sortDir && (resKey[i] > resKey[i + 1])) ||
(!sortDir && (resKey[i] < resKey[i + 1]))) {
fprintf(stderr,
"***Set %u result key array is not ordered properly***\n", j);
flag = 0;
goto brk;
}
}
int flag = 1;
for (uint j = 0; j < batchSize; j++, srcKey += arrayLength, resKey += arrayLength) {
// Build histograms for keys arrays
memset(srcHist, 0, numValues * sizeof(uint));
memset(resHist, 0, numValues * sizeof(uint));
for (uint i = 0; i < arrayLength; i++) {
if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
srcHist[srcKey[i]]++;
resHist[resKey[i]]++;
}
else {
fprintf(stderr, "***Set %u source/result key arrays are not limited properly***\n", j);
flag = 0;
goto brk;
}
}
// Compare the histograms
for (uint i = 0; i < numValues; i++)
if (srcHist[i] != resHist[i]) {
fprintf(stderr, "***Set %u source/result keys histograms do not match***\n", j);
flag = 0;
goto brk;
}
// Finally check the ordering
for (uint i = 0; i < arrayLength - 1; i++)
if ((sortDir && (resKey[i] > resKey[i + 1])) || (!sortDir && (resKey[i] < resKey[i + 1]))) {
fprintf(stderr, "***Set %u result key array is not ordered properly***\n", j);
flag = 0;
goto brk;
}
}
brk:
free(resHist);
free(srcHist);
free(resHist);
free(srcHist);
if (flag) printf("OK\n");
if (flag)
printf("OK\n");
return flag;
return flag;
}
////////////////////////////////////////////////////////////////////////////////
// Value validation / stability check routines
////////////////////////////////////////////////////////////////////////////////
extern "C" void fillValues(uint *val, uint N) {
for (uint i = 0; i < N; i++) val[i] = i;
extern "C" void fillValues(uint *val, uint N)
{
for (uint i = 0; i < N; i++)
val[i] = i;
}
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey,
uint batchSize, uint arrayLength) {
int correctFlag = 1, stableFlag = 1;
extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength)
{
int correctFlag = 1, stableFlag = 1;
printf("...inspecting keys and values array: ");
printf("...inspecting keys and values array: ");
for (uint i = 0; i < batchSize;
i++, resKey += arrayLength, resVal += arrayLength) {
for (uint j = 0; j < arrayLength; j++) {
if (resKey[j] != srcKey[resVal[j]]) correctFlag = 0;
for (uint i = 0; i < batchSize; i++, resKey += arrayLength, resVal += arrayLength) {
for (uint j = 0; j < arrayLength; j++) {
if (resKey[j] != srcKey[resVal[j]])
correctFlag = 0;
if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) &&
(resVal[j] > resVal[j + 1]))
stableFlag = 0;
if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && (resVal[j] > resVal[j + 1]))
stableFlag = 0;
}
}
}
printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
printf(stableFlag ? "...stability property: stable!\n"
: "...stability property: NOT stable\n");
printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
printf(stableFlag ? "...stability property: stable!\n" : "...stability property: NOT stable\n");
return correctFlag;
return correctFlag;
}
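Because fillValues() stores each element's original index as its value, stability is detectable after the sort: wherever two adjacent keys are equal, their values must still be in ascending order, otherwise the sort reordered equal keys. A standalone sketch of that predicate (a hypothetical helper, not part of the sample):

// keys/vals: sorted output, where vals[i] is the element's index before sorting.
bool isStableSort(const unsigned *keys, const unsigned *vals, unsigned n)
{
    for (unsigned i = 0; i + 1 < n; i++)
        if (keys[i] == keys[i + 1] && vals[i] > vals[i + 1])
            return false; // equal keys came out in a different relative order
    return true;
}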

View File

@ -29,106 +29,105 @@
#include <stdio.h>
// Includes CUDA
#include <cuda_runtime.h>
#include <cuda/barrier>
#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cuda_runtime.h>
// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check
#include <helper_cuda.h> // helper functions for CUDA error check
namespace cg = cooperative_groups;
#if __CUDA_ARCH__ >= 700
template <bool writeSquareRoot>
__device__ void reduceBlockData(
cuda::barrier<cuda::thread_scope_block> &barrier,
cg::thread_block_tile<32> &tile32, double &threadSum, double *result) {
extern __shared__ double tmp[];
#pragma unroll
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
threadSum += tile32.shfl_down(threadSum, offset);
}
if (tile32.thread_rank() == 0) {
tmp[tile32.meta_group_rank()] = threadSum;
}
auto token = barrier.arrive();
barrier.wait(std::move(token));
// Warp 0 will perform the last round of reduction
if (tile32.meta_group_rank() == 0) {
double beta = tile32.thread_rank() < tile32.meta_group_size()
? tmp[tile32.thread_rank()]
: 0.0;
__device__ void reduceBlockData(cuda::barrier<cuda::thread_scope_block> &barrier,
cg::thread_block_tile<32> &tile32,
double &threadSum,
double *result)
{
extern __shared__ double tmp[];
#pragma unroll
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
beta += tile32.shfl_down(beta, offset);
threadSum += tile32.shfl_down(threadSum, offset);
}
if (tile32.thread_rank() == 0) {
tmp[tile32.meta_group_rank()] = threadSum;
}
if (tile32.thread_rank() == 0) {
if (writeSquareRoot)
*result = sqrt(beta);
else
*result = beta;
auto token = barrier.arrive();
barrier.wait(std::move(token));
// Warp 0 will perform the last round of reduction
if (tile32.meta_group_rank() == 0) {
double beta = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0;
#pragma unroll
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
beta += tile32.shfl_down(beta, offset);
}
if (tile32.thread_rank() == 0) {
if (writeSquareRoot)
*result = sqrt(beta);
else
*result = beta;
}
}
}
}
#endif
__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB,
double *partialResults, int size) {
__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size)
{
#if __CUDA_ARCH__ >= 700
#pragma diag_suppress static_var_with_dynamic_init
cg::thread_block cta = cg::this_thread_block();
cg::grid_group grid = cg::this_grid();
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
cg::thread_block cta = cg::this_thread_block();
cg::grid_group grid = cg::this_grid();
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
__shared__ cuda::barrier<cuda::thread_scope_block> barrier;
__shared__ cuda::barrier<cuda::thread_scope_block> barrier;
if (threadIdx.x == 0) {
init(&barrier, blockDim.x);
}
cg::sync(cta);
double threadSum = 0.0;
for (int i = grid.thread_rank(); i < size; i += grid.size()) {
threadSum += (double)(vecA[i] * vecB[i]);
}
// Each thread block performs reduction of partial dotProducts and writes to
// global mem.
reduceBlockData<false>(barrier, tile32, threadSum,
&partialResults[blockIdx.x]);
cg::sync(grid);
// One block performs the final summation of partial dot products
// of all the thread blocks and writes the sqrt of the final dot product.
if (blockIdx.x == 0) {
threadSum = 0.0;
for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) {
threadSum += partialResults[i];
if (threadIdx.x == 0) {
init(&barrier, blockDim.x);
}
reduceBlockData<true>(barrier, tile32, threadSum, &partialResults[0]);
}
cg::sync(grid);
cg::sync(cta);
const double finalValue = partialResults[0];
double threadSum = 0.0;
for (int i = grid.thread_rank(); i < size; i += grid.size()) {
threadSum += (double)(vecA[i] * vecB[i]);
}
// Perform normalization of vecA & vecB.
for (int i = grid.thread_rank(); i < size; i += grid.size()) {
vecA[i] = (float)vecA[i] / finalValue;
vecB[i] = (float)vecB[i] / finalValue;
}
// Each thread block performs reduction of partial dotProducts and writes to
// global mem.
reduceBlockData<false>(barrier, tile32, threadSum, &partialResults[blockIdx.x]);
cg::sync(grid);
// One block performs the final summation of partial dot products
// of all the thread blocks and writes the sqrt of the final dot product.
if (blockIdx.x == 0) {
threadSum = 0.0;
for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) {
threadSum += partialResults[i];
}
reduceBlockData<true>(barrier, tile32, threadSum, &partialResults[0]);
}
cg::sync(grid);
const double finalValue = partialResults[0];
// Perform normalization of vecA & vecB.
for (int i = grid.thread_rank(); i < size; i += grid.size()) {
vecA[i] = (float)vecA[i] / finalValue;
vecB[i] = (float)vecB[i] / finalValue;
}
#endif
}
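The reduction above runs in two stages: each 32-thread tile collapses its partial sum with register shuffles, the per-warp results are staged in dynamic shared memory behind the cuda::barrier, and warp 0 then folds the staged values (taking the square root only on the final pass). A stripped-down sketch of just the warp-level stage, as a hypothetical helper rather than the sample's reduceBlockData:

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// Sum 'val' across one 32-thread tile using a shuffle tree; the total ends up
// on the thread with tile.thread_rank() == 0.
__device__ double warpReduceSum(cg::thread_block_tile<32> tile, double val)
{
    for (int offset = tile.size() / 2; offset > 0; offset /= 2)
        val += tile.shfl_down(val, offset);
    return val;
}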
@ -137,119 +136,113 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
printf("%s starting...\n", argv[0]);
int main(int argc, char **argv)
{
printf("%s starting...\n", argv[0]);
// This will pick the best possible CUDA capable device
int dev = findCudaDevice(argc, (const char **)argv);
// This will pick the best possible CUDA capable device
int dev = findCudaDevice(argc, (const char **)argv);
int major = 0;
checkCudaErrors(
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
int major = 0;
checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
// Arrive-Wait Barrier requires a GPU of Volta (SM7X) architecture or higher.
if (major < 7) {
printf("simpleAWBarrier requires SM 7.0 or higher. Exiting...\n");
exit(EXIT_WAIVED);
}
int supportsCooperativeLaunch = 0;
checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch,
cudaDevAttrCooperativeLaunch, dev));
if (!supportsCooperativeLaunch) {
printf(
"\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
"Waiving the run\n",
dev);
exit(EXIT_WAIVED);
}
int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev);
printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!");
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId) {
float *vecA, *d_vecA;
float *vecB, *d_vecB;
double *d_partialResults;
int size = 10000000;
checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size));
checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size));
checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size));
checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size));
float baseVal = 2.0;
for (int i = 0; i < size; i++) {
vecA[i] = vecB[i] = baseVal;
}
cudaStream_t stream;
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size,
cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size,
cudaMemcpyHostToDevice, stream));
// Kernel configuration, where a one-dimensional
// grid and one-dimensional blocks are configured.
int minGridSize = 0, blockSize = 0;
checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(
&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));
int smemSize = ((blockSize / 32) + 1) * sizeof(double);
int numBlocksPerSm = 0;
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));
int multiProcessorCount = 0;
checkCudaErrors(cudaDeviceGetAttribute(
&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));
minGridSize = multiProcessorCount * numBlocksPerSm;
checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));
printf(
"Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
"blockSize = %d\n",
minGridSize, blockSize);
dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);
void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB,
(void *)&d_partialResults, (void *)&size};
checkCudaErrors(
cudaLaunchCooperativeKernel((void *)normVecByDotProductAWBarrier, dimGrid,
dimBlock, kernelArgs, smemSize, stream));
checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size,
cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaStreamSynchronize(stream));
float expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
unsigned int matches = 0;
for (int i = 0; i < size; i++) {
if ((vecA[i] - expectedResult) > 0.00001) {
printf("mismatch at i = %d\n", i);
break;
} else {
matches++;
// Arrive-Wait Barrier requires a GPU of Volta (SM7X) architecture or higher.
if (major < 7) {
printf("simpleAWBarrier requires SM 7.0 or higher. Exiting...\n");
exit(EXIT_WAIVED);
}
}
printf("Result = %s\n", matches == size ? "PASSED" : "FAILED");
checkCudaErrors(cudaFree(d_vecA));
checkCudaErrors(cudaFree(d_vecB));
checkCudaErrors(cudaFree(d_partialResults));
int supportsCooperativeLaunch = 0;
checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev));
checkCudaErrors(cudaFreeHost(vecA));
checkCudaErrors(cudaFreeHost(vecB));
return matches == size;
if (!supportsCooperativeLaunch) {
printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
"Waiving the run\n",
dev);
exit(EXIT_WAIVED);
}
int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev);
printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!");
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
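runNormVecByDotProductAWBarrier() below sizes its grid from the occupancy API so that every block of the cooperative launch can be resident on the GPU at once, which grid.sync() requires (oversized grids make cudaLaunchCooperativeKernel fail). A condensed sketch of that sizing pattern as a hypothetical helper; unlike the sample, it takes the dynamic shared-memory size as a parameter instead of deriving it from the chosen block size:

#include <cuda_runtime.h>

// Launch 'kernel' cooperatively with as many blocks as can be co-resident.
inline cudaError_t launchCooperative(void *kernel, void **args, size_t smemBytes, cudaStream_t stream, int device)
{
    int blockSize = 0, minGridSize = 0;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, kernel, smemBytes, 0);

    int blocksPerSm = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, kernel, blockSize, smemBytes);

    int smCount = 0;
    cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, device);

    dim3 grid(smCount * blocksPerSm, 1, 1), block(blockSize, 1, 1);
    return cudaLaunchCooperativeKernel(kernel, grid, block, args, smemBytes, stream);
}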
int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
{
float *vecA, *d_vecA;
float *vecB, *d_vecB;
double *d_partialResults;
int size = 10000000;
checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size));
checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size));
checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size));
checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size));
float baseVal = 2.0;
for (int i = 0; i < size; i++) {
vecA[i] = vecB[i] = baseVal;
}
cudaStream_t stream;
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
// Kernel configuration, where a one-dimensional
// grid and one-dimensional blocks are configured.
int minGridSize = 0, blockSize = 0;
checkCudaErrors(
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));
int smemSize = ((blockSize / 32) + 1) * sizeof(double);
int numBlocksPerSm = 0;
checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));
int multiProcessorCount = 0;
checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));
minGridSize = multiProcessorCount * numBlocksPerSm;
checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));
printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
"blockSize = %d\n",
minGridSize,
blockSize);
dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);
void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, (void *)&d_partialResults, (void *)&size};
checkCudaErrors(cudaLaunchCooperativeKernel(
(void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream));
checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaStreamSynchronize(stream));
float expectedResult = (baseVal / sqrt(size * baseVal * baseVal));
unsigned int matches = 0;
for (int i = 0; i < size; i++) {
if ((vecA[i] - expectedResult) > 0.00001) {
printf("mismatch at i = %d\n", i);
break;
}
else {
matches++;
}
}
printf("Result = %s\n", matches == size ? "PASSED" : "FAILED");
checkCudaErrors(cudaFree(d_vecA));
checkCudaErrors(cudaFree(d_vecB));
checkCudaErrors(cudaFree(d_partialResults));
checkCudaErrors(cudaFreeHost(vecA));
checkCudaErrors(cudaFreeHost(vecB));
return matches == size;
}
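As a cross-check of the expectedResult formula above: with every element of both vectors equal to baseVal, the dot product is size * baseVal^2, its square root is baseVal * sqrt(size), and each normalized element is therefore baseVal / (baseVal * sqrt(size)) = 1 / sqrt(size), roughly 0.000316 for size = 10,000,000. A tiny standalone check of that arithmetic:

#include <cmath>
#include <cstdio>

int main()
{
    const double baseVal  = 2.0;
    const double size     = 10000000.0;
    const double dot      = size * baseVal * baseVal; // A.B when every element equals baseVal
    const double expected = baseVal / std::sqrt(dot); // same formula as the host-side check
    std::printf("expected = %.8f, 1/sqrt(size) = %.8f\n", expected, 1.0 / std::sqrt(size));
    return 0;
}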

View File

@ -34,17 +34,17 @@
#endif
// Includes, system
#include <stdio.h>
#include <cassert>
#include <stdio.h>
// Includes CUDA
#include <cuda_runtime.h>
// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check
#include <helper_cuda.h> // helper functions for CUDA error check
const char *sampleName = "simpleAssert";
@ -58,9 +58,10 @@ bool testResult = true;
//! Tests assert function.
//! Threads whose id >= N will print an assertion-failed error message.
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int N) {
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
assert(gtid < N);
__global__ void testKernel(int N)
{
int gtid = blockIdx.x * blockDim.x + threadIdx.x;
assert(gtid < N);
}
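A device-side assert does not stop the host immediately; the failure is reported by the next synchronizing CUDA call, which returns cudaErrorAssert. A minimal sketch of detecting that on the host, separate from the sample's own runTest():

#include <cassert>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void assertKernel(int N)
{
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    assert(gtid < N); // fails for every thread with gtid >= N
}

int main()
{
    assertKernel<<<2, 32>>>(60); // 64 threads launched, the last 4 fail the assert
    cudaError_t err = cudaDeviceSynchronize();
    if (err == cudaErrorAssert)
        std::printf("device assert triggered as expected: %s\n", cudaGetErrorString(err));
    return 0;
}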
////////////////////////////////////////////////////////////////////////////////
@ -70,59 +71,60 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) {
printf("%s starting...\n", sampleName);
int main(int argc, char **argv)
{
printf("%s starting...\n", sampleName);
runTest(argc, argv);
runTest(argc, argv);
printf("%s completed, returned %s\n", sampleName,
testResult ? "OK" : "ERROR!");
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");