Apply consistent code formatting across the repo. Add clang-format and pre-commit hooks.

Rob Armstrong 2025-03-27 10:30:07 -07:00
parent 2cd58fbc9a
commit ceab6e8bcc
782 changed files with 107230 additions and 106548 deletions

.clang-format (new file, 49 lines)
@@ -0,0 +1,49 @@
---
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: Consecutive
AlignConsecutiveDeclarations: Consecutive
AlignConsecutiveMacros: Consecutive
AlignEscapedNewlines: Left
AlignOperands: AlignAfterOperator
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: false
BinPackArguments: false
BinPackParameters: false
BraceWrapping:
  AfterClass: true
  AfterControlStatement: false
  AfterExternBlock: true
  AfterFunction: true
  AfterStruct: true
  AfterUnion: true
  BeforeCatch: true
  BeforeElse: true
  IndentBraces: false
BreakBeforeBraces: Custom
BreakBeforeConceptDeclarations: true
BreakBeforeBinaryOperators: NonAssignment
BreakBeforeTernaryOperators: true
BreakConstructorInitializers: BeforeComma
BreakInheritanceList: BeforeComma
ColumnLimit: 120
DerivePointerAlignment: false
FixNamespaceComments: true
IncludeCategories:
  - Regex: '^<.*>'
    Priority: 1
  - Regex: '^".*"'
    Priority: 2
SortIncludes: true
IncludeBlocks: Regroup
IndentWidth: 4
MaxEmptyLinesToKeep: 2
PointerAlignment: Right
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
Standard: c++17
TabWidth: 4
UseTab: Never
...
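As a rough illustration (not part of this commit), a small helper formatted under these options comes out roughly as in the sketch below: the opening brace on its own line after the function, four-space indentation, right-aligned pointers, and consecutive declarations and assignments padded into aligned columns. The function and variable names are invented for the example.

#include <cstring>

// Hypothetical helper, shown only to illustrate the style configured above.
int *makeZeroedBuffer(size_t count)
{
    int   *buffer = new int[count];
    size_t nbytes = count * sizeof(int);

    memset(buffer, 0, nbytes); // zero the freshly allocated block before returning it
    return buffer;
}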

.pre-commit-config.yaml (new file, 100 lines)
@@ -0,0 +1,100 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
ci:
  autofix_commit_msg: |
    [pre-commit.ci] auto code formatting
  autofix_prs: false
  autoupdate_branch: ''
  autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate'
  autoupdate_schedule: quarterly
  skip: []
  submodules: false
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: end-of-file-fixer
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
      - id: mixed-line-ending
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
      - id: trailing-whitespace
        exclude: |
          (?x)^(
            .*\.raw$|
            .*\.bin$|
            .*\.dat$|
            .*\.nv12$|
            data/.*|
            Common/.*
          )
        files: |
          (?x)^(
            .*\.txt$|
            .*\.md$|
            .*\.cpp$|
            .*\.cxx$|
            .*\.hpp$|
            .*\.h$|
            .*\.cu$|
            .*\.cuh$
          )
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v19.1.6
    hooks:
      - id: clang-format
        types_or: [file]
        files: |
          (?x)^(
            ^.*\.c$|
            ^.*\.cpp$|
            ^.*\.cu$|
            ^.*\.cuh$|
            ^.*\.cxx$|
            ^.*\.h$|
            ^.*\.hpp$|
            ^.*\.inl$|
            ^.*\.mm$
          )
        exclude: |
          (?x)^(
            Common/.*
          )
        args: ["-fallback-style=none", "-style=file", "-i"]
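Assuming the standard pre-commit workflow (the commands are not part of this commit), contributors would enable these hooks locally with "pre-commit install" and apply them across the whole tree with "pre-commit run --all-files"; a repository-wide run of the clang-format hook is the kind of sweep that would produce a 782-file reformat like this one.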


UnifiedMemoryStreams sample source, reformatted with no functional change.
@@ -31,10 +31,10 @@
The system includes are reordered so that #include <algorithm> comes before <cstdio>, <ctime>, and <vector>.
@@ -51,291 +51,287 @@
The rest of the file picks up the new style:
- template <typename T> is joined onto the declaration it introduces (template <typename T> struct Task), and the opening braces of structs and functions move to their own lines, with four-space indentation inside.
- The Task constructor initializer lists are broken before each comma, one initializer per line (: size(0) / , id(0) / , data(NULL) ...).
- Struct members such as threadData_t's tid, TaskListPtr, streams, and handles are aligned into columns per AlignConsecutiveDeclarations.
- Long checkCudaErrors(...) wrappers around cudaStreamAttachMemAsync, cudaMallocManaged, and cublasDgemv, and the multi-line printf status messages, are rejoined or re-broken to fit the 120-column limit.
- } else { becomes a closing brace with else on the following line, matching BeforeElse: true.
- std::vector<Task<double> > is tightened to std::vector<Task<double>> in the TaskList declaration and in the final swap that frees it; the task partitioning across pthreads or OpenMP threads, the kernel work, and the cleanup are otherwise unchanged.


Event-timing sample built around increment_kernel and CUDA events, reformatted with no functional change.
@@ -38,105 +38,107 @@
- The CUDA runtime includes are sorted so cuda_profiler_api.h precedes cuda_runtime.h.
- increment_kernel, correct_output, and main get their opening braces on their own lines and four-space indentation.
- correct_output now iterates with an unbraced for whose body is the braced if check, and the mismatch-reporting printf stays on one line within the 120-column limit.
- In main, the multi-line printf for the CPU iteration count is rejoined onto a single line; the pinned host allocation, async copies, increment_kernel launch, event timing, and cleanup are otherwise unchanged.


CUDA Clock sample (runtime API version), reformatted with no functional change.
@@ -48,43 +48,46 @@
- The timedReduction kernel signature is joined onto one line, with the opening brace on the next line.
- Single-statement ifs such as if (tid == 0) timer[bid] = clock(); are split so the guarded statement sits on its own indented line.
- The minimum-finding reduction loop and the final timer write keep the same logic under four-space indentation.
@@ -104,50 +107,46 @@
- main gets its opening brace on its own line.
- The checkCudaErrors(cudaMalloc(...)) and cudaMemcpy calls that previously wrapped mid-argument, and the timedReduction<<<NUM_BLOCKS, NUM_THREADS, ...>>> launch, are rejoined onto single lines within the 120-column limit.
- The averaging of per-block clock counts and the return are unchanged.


CUDA Clock sample, NVRTC host code (compileFileToCUBIN / cuLaunchKernel driver-API path), reformatted with no functional change.
@@ -34,12 +34,11 @@
- The system and helper includes are sorted alphabetically: assert.h, cuda_runtime.h, nvrtc_helper.h, stdint.h, stdio.h.
@@ -71,64 +70,68 @@
- main gets its opening brace on its own line and four-space indentation.
- The cuLaunchKernel call is re-broken with one argument per line, keeping the /* grid dim */, /* block dim */, /* shared mem, stream */, and /* arguments */ comments next to the arguments they describe.
- The wrapped cuMemcpyDtoH and cuMemFree calls are rejoined onto single lines; the timing computation and return are unchanged.


timedReduction kernel source (the extern "C" __global__ kernel compiled by the NVRTC variant), reformatted with no functional change.
@@ -37,38 +37,41 @@
- The kernel signature is joined onto one line with the opening brace on the next line.
- if (tid == 0) timer[bid] = clock();, the final result write, and the closing timer[bid + gridDim.x] = clock(); are each split so the guarded statement sits on its own indented line.
- The minimum-finding reduction loop is unchanged apart from four-space indentation.


Multi-GPU OpenMP sample built around kernelAddConstant, reformatted with no functional change.
@@ -32,128 +32,125 @@
- kernelAddConstant, correctResult, and main get their opening braces on their own lines and four-space indentation.
- Single-statement bodies such as if (data[i] != i + b) return 0;, for (unsigned int i = 0; i < n; i++) a[i] = i;, and if (a) free(a); are split so the statement sits on its own line under the condition or loop header.
- The wrapped omp_set_num_threads(num_gpus) call and its trailing comment are rejoined onto one line, as are the long checkCudaErrors(cudaSetDevice(...)), cudaMemcpy, and progress printf calls, all within the 120-column limit.
- The per-thread device selection, allocation, kernel launch, and result check inside the omp parallel region are otherwise unchanged.


fp16ScalarProduct sample, reformatted with no functional change.
@@ -25,191 +25,188 @@
- The quoted project includes "cuda_fp16.h" and "helper_cuda.h" are regrouped after the system includes <cstdio>, <cstdlib>, and <ctime>, per the new include categories.
- In reduceInShared_intrinsics and reduceInShared_native, each guarded step of the shared-memory reduction (if (threadIdx.x < 64) down through < 1, each followed by __syncthreads()) is split so the assignment sits on its own indented line.
- The scalarProductKernel_intrinsics and scalarProductKernel_native signatures are broken after __global__ void, with the full parameter list on the next line and the opening brace on its own line.
- main gets its opening brace on its own line; the wrapped cudaMallocHost, cudaMalloc, cudaMemcpy, kernel-launch, and PASSED/FAILED printf calls are rejoined to fit the 120-column limit.
- Input generation, both kernel launches, the host-side accumulation of per-block results, and cleanup are otherwise unchanged.


@ -40,314 +40,303 @@
*/ */
// System includes // System includes
#include <stdio.h>
#include <assert.h> #include <assert.h>
#include <stdio.h>
// CUDA runtime // CUDA runtime
#include <cuda_runtime.h>
#include <cuda_profiler_api.h> #include <cuda_profiler_api.h>
#include <cuda_runtime.h>
// Helper functions and utilities to work with CUDA // Helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h>
/** /**
* Matrix multiplication (CUDA Kernel) on the device: C = A * B * Matrix multiplication (CUDA Kernel) on the device: C = A * B
* wA is A's width and wB is B's width * wA is A's width and wB is B's width
*/ */
template <int BLOCK_SIZE> __global__ void MatrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
{
    // Block index
    int bx = blockIdx.x;
    int by = blockIdx.y;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    int aBegin = wA * BLOCK_SIZE * by;

    // Index of the last sub-matrix of A processed by the block
    int aEnd = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    int aStep = BLOCK_SIZE;

    // Index of the first sub-matrix of B processed by the block
    int bBegin = BLOCK_SIZE * bx;

    // Step size used to iterate through the sub-matrices of B
    int bStep = BLOCK_SIZE * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        As[ty][tx] = A[a + wA * ty + tx];
        Bs[ty][tx] = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k) {
            Csub += As[ty][k] * Bs[k][tx];
        }

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    C[c + wB * ty + tx] = Csub;
}
void ConstantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}

/**
 * Run a simple test of matrix multiplication using CUDA
 */
int MatrixMultiply(int argc, char **argv, int block_size, const dim3 &dimsA, const dim3 &dimsB)
{
    // Allocate host memory for matrices A and B
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A;
    checkCudaErrors(cudaMallocHost(&h_A, mem_size_A));
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B;
    checkCudaErrors(cudaMallocHost(&h_B, mem_size_B));
    cudaStream_t stream;

    // Initialize host memory
    const float valB = 0.01f;
    ConstantInit(h_A, size_A, 1.0f);
    ConstantInit(h_B, size_B, valB);

    // Allocate device memory
    float *d_A, *d_B, *d_C;

    // Allocate host matrix C
    dim3 dimsC(dimsB.x, dimsA.y, 1);
    unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
    float *h_C;
    checkCudaErrors(cudaMallocHost(&h_C, mem_size_C));

    if (h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrix C!\n");
        exit(EXIT_FAILURE);
    }

    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
    // Allocate CUDA events that we'll use for timing
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    // copy host memory to device
    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));

    // Setup execution parameters
    dim3 threads(block_size, block_size);
    dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);

    // Create and start timer
    printf("Computing result using CUDA Kernel...\n");

    // Performs warmup operation using matrixMul CUDA kernel
    if (block_size == 16) {
        MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }
    else {
        MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
    }

    printf("done\n");
    checkCudaErrors(cudaStreamSynchronize(stream));

    // Record the start event
    checkCudaErrors(cudaEventRecord(start, stream));

    // Execute the kernel
    int nIter = 300;

    for (int j = 0; j < nIter; j++) {
        if (block_size == 16) {
            MatrixMulCUDA<16><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
        else {
            MatrixMulCUDA<32><<<grid, threads, 0, stream>>>(d_C, d_A, d_B, dimsA.x, dimsB.x);
        }
    }

    // Record the stop event
    checkCudaErrors(cudaEventRecord(stop, stream));

    // Wait for the stop event to complete
    checkCudaErrors(cudaEventSynchronize(stop));

    float msecTotal = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

    // Compute and print the performance
    float  msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul =
        2.0 * static_cast<double>(dimsA.x) * static_cast<double>(dimsA.y) * static_cast<double>(dimsB.x);
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf("Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops,"
           " WorkgroupSize= %u threads/block\n",
           gigaFlops,
           msecPerMatrixMul,
           flopsPerMatrixMul,
           threads.x * threads.y);

    // Copy result from device to host
    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    printf("Checking computed result for correctness: ");
    bool correct = true;

    // test relative error by the formula
    //     |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|>  < eps
    double eps = 1.e-6; // machine zero

    for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++) {
        double abs_err = fabs(h_C[i] - (dimsA.x * valB));
        double dot_length = dimsA.x;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err / abs_val / dot_length;

        if (rel_err > eps) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    // Clean up memory
    checkCudaErrors(cudaFreeHost(h_A));
    checkCudaErrors(cudaFreeHost(h_B));
    checkCudaErrors(cudaFreeHost(h_C));
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    printf("\nNOTE: The CUDA Samples are not meant for performance "
           "measurements. Results may vary when GPU Boost is enabled.\n");

    if (correct) {
        return EXIT_SUCCESS;
    }
    else {
        return EXIT_FAILURE;
    }
}
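Because h_A is filled with 1.0f and h_B with valB, every element of C should equal dimsA.x * valB, which is the reference value the check above compares against. A minimal sketch of that arithmetic, assuming wA = 320 (the default dimsA.x in main below) and a hypothetical value read back from the device:

// Illustrative only: the reference value and relative-error test used by the check above.
#include <cmath>
#include <cstdio>

int main()
{
    const int    wA       = 320;        // assumed width of A (default dimsA.x)
    const float  valB     = 0.01f;      // every element of B
    const double expected = wA * valB;  // 3.2: dot product of wA ones with wA copies of 0.01
    const double gpu      = 3.1999998;  // hypothetical device result
    double       rel_err  = std::fabs(gpu - expected) / (std::fabs(gpu) * wA);
    printf("expected=%f rel_err=%e -> %s\n", expected, rel_err, rel_err < 1.e-6 ? "PASS" : "FAIL");
    return 0;
}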
/**
 * Program main
 */
int main(int argc, char **argv)
{
    printf("[Matrix Multiply Using CUDA] - Starting...\n");

    if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
        printf("Usage -device=n (n >= 0 for deviceID)\n");
        printf("      -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
        printf("      -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
        printf("  Note: Outer matrix dimensions of A & B matrices"
               " must be equal.\n");

        exit(EXIT_SUCCESS);
    }

    // This will pick the best possible CUDA capable device, otherwise
    // override the device ID based on input provided at the command line
    int dev = findCudaDevice(argc, (const char **)argv);

    int block_size = 32;

    dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
    dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);

    // width of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
        dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
    }

    // height of Matrix A
    if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
        dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
    }

    // width of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
        dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
    }

    // height of Matrix B
    if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
        dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
    }

    if (dimsA.x != dimsB.y) {
        printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
        exit(EXIT_FAILURE);
    }

    printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);

    checkCudaErrors(cudaProfilerStart());
    int matrix_result = MatrixMultiply(argc, argv, block_size, dimsA, dimsB);
    checkCudaErrors(cudaProfilerStop());

    exit(matrix_result);
}


@ -30,11 +30,11 @@
// Matrix dimensions
// (chosen as multiples of the thread block size for simplicity)
#define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width
#define HB WA               // Matrix B height
#define WC WB               // Matrix C width
#define HC HA               // Matrix C height

#endif // _MATRIXMUL_H_
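For reference, with block_size = 32 (the largest size the Driver API host code will try), these macros work out to WA = 128 and HA = 192 for A, WB = 128 and HB = WA = 128 for B, and WC = 128, HC = 192 for C; halving block_size scales every dimension accordingly.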


@ -46,23 +46,23 @@
// includes, system
#include <builtin_types.h>
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, project, CUDA
#include <cuda.h>
#include <helper_cuda_drvapi.h>
#include <helper_image.h>
#include <helper_string.h>
#include <helper_timer.h>
#include <iostream>
#include <string>

#include "matrixMul.h"
@ -71,11 +71,9 @@
void runTest(int argc, char **argv);
void randomInit(float *, int);
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);

static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size);

#ifndef FATBIN_FILE
#define FATBIN_FILE "matrixMul_kernel64.fatbin"
@ -84,237 +82,252 @@ static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul,
////////////////////////////////////////////////////////////////////////////////
// Globals
////////////////////////////////////////////////////////////////////////////////
CUdevice    cuDevice;
CUcontext   cuContext;
CUmodule    cuModule;
size_t      totalGlobalMem;

const char *sSDKsample = "matrixMulDrv (Driver API)";

void constantInit(float *data, int size, float val)
{
    for (int i = 0; i < size; ++i) {
        data[i] = val;
    }
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("[ %s ]\n", sSDKsample);

    runTest(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    // initialize CUDA
    CUfunction matrixMul = NULL;
    int        block_size = 0;

    initCUDA(argc, argv, &matrixMul, &block_size);

    // set seed for rand()
    srand(2006);

    // allocate host memory for matrices A and B
    unsigned int size_A = WA * HA;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float       *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
    unsigned int size_B = WB * HB;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float       *h_B = reinterpret_cast<float *>(malloc(mem_size_B));

    // initialize host memory
    const float valB = 0.01f;
    constantInit(h_A, size_A, 1.0f);
    constantInit(h_B, size_B, valB);

    // allocate device memory
    CUdeviceptr d_A;
    checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
    CUdeviceptr d_B;
    checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));

    // copy host memory to device
    checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
    checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));

    // allocate device memory for result
    size_t size_C = WC * HC;
    size_t mem_size_C = sizeof(float) * size_C;

    CUdeviceptr d_C;
    checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));

    // allocate mem for the result on host side
    float *h_C = reinterpret_cast<float *>(malloc(mem_size_C));

    // create and start timer
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);

    // start the timer
    sdkStartTimer(&timer);

    // There are two ways to launch CUDA kernels via the Driver API.
    // In this CUDA Sample, we illustrate both ways to pass parameters
    // and specify parameters. By default we use the simpler method.
    dim3 block(block_size, block_size, 1);
    dim3 grid(WC / block_size, HC / block_size, 1);

    if (1) {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (simplier method)
        size_t Matrix_Width_A = (size_t)WA;
        size_t Matrix_Width_B = (size_t)WB;
        void  *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};

        // new CUDA 4.0 Driver API Kernel launch call
        checkCudaErrors(cuLaunchKernel(matrixMul,
                                       grid.x,
                                       grid.y,
                                       grid.z,
                                       block.x,
                                       block.y,
                                       block.z,
                                       2 * block_size * block_size * sizeof(float),
                                       NULL,
                                       args,
                                       NULL));
    }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (advanced method)
        int  offset = 0;
        char argBuffer[256];

        // pass in launch parameters (not actually de-referencing CUdeviceptr).
        // CUdeviceptr is storing the value of the parameters
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_C;
        offset += sizeof(d_C);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_A;
        offset += sizeof(d_A);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = d_B;
        offset += sizeof(d_B);

        size_t Matrix_Width_A = (size_t)WA;
        size_t Matrix_Width_B = (size_t)WB;

        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_A;
        offset += sizeof(Matrix_Width_A);
        *(reinterpret_cast<CUdeviceptr *>(&argBuffer[offset])) = Matrix_Width_B;
        offset += sizeof(Matrix_Width_B);

        void *kernel_launch_config[5] = {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};

        // new CUDA 4.0 Driver API Kernel launch call
        checkCudaErrors(cuLaunchKernel(matrixMul,
                                       grid.x,
                                       grid.y,
                                       grid.z,
                                       block.x,
                                       block.y,
                                       block.z,
                                       2 * block_size * block_size * sizeof(float),
                                       NULL,
                                       NULL,
                                       reinterpret_cast<void **>(&kernel_launch_config)));
    }

    // copy result from device to host
    checkCudaErrors(cuMemcpyDtoH(reinterpret_cast<void *>(h_C), d_C, mem_size_C));

    // stop and destroy timer
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    printf("Checking computed result for correctness: ");
    bool correct = true;

    for (int i = 0; i < static_cast<int>(WC * HC); i++) {
        if (fabs(h_C[i] - (WA * valB)) > 1e-5) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > 1e-5\n", i, h_C[i], WA * valB);
            correct = false;
        }
    }

    printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");

    printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
           "Results may vary when GPU Boost is enabled.\n");

    // clean up memory
    free(h_A);
    free(h_B);
    free(h_C);
    checkCudaErrors(cuMemFree(d_A));
    checkCudaErrors(cuMemFree(d_B));
    checkCudaErrors(cuMemFree(d_C));
    checkCudaErrors(cuCtxDestroy(cuContext));
}
// Allocates a matrix with random float entries.
void randomInit(float *data, int size)
{
    for (int i = 0; i < size; ++i) {
        data[i] = rand() / static_cast<float>(RAND_MAX);
    }
}

static int initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *blk_size)
{
    CUfunction cuFunction = 0;
    int        major = 0, minor = 0;
    char       deviceName[100];

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

    // get compute capabilities and the devicename
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, cuDevice));
    printf("  Total amount of global memory:     %llu bytes\n", (long long unsigned int)totalGlobalMem);

    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    std::string        module_path;
    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // select the suitable kernel function
    const char *kernels[] = {"matrixMul_bs32_64bit", "matrixMul_bs16_64bit", "matrixMul_bs8_64bit"};

    int idx = 0;
    int block_size = 32;

    while (idx < 3) {
        int threadsPerBlock = 0;
        int blocksPerGrid   = 0;

        checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, kernels[idx]));
        checkCudaErrors(cuOccupancyMaxPotentialBlockSize(
            &blocksPerGrid, &threadsPerBlock, cuFunction, 0, 2 * block_size * block_size * sizeof(float), 0));
        if (block_size * block_size <= threadsPerBlock) {
            printf("> %d block size selected\n", block_size);
            break;
        }
        else {
            block_size /= 2;
        }
        idx++;
    }

    *pMatrixMul = cuFunction;
    *blk_size   = block_size;

    return 0;
}
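As a worked example of the selection loop above (hypothetical numbers): if cuOccupancyMaxPotentialBlockSize reports a 1024-thread potential block size for matrixMul_bs32_64bit, then 32 * 32 = 1024 <= 1024 and the 32-wide kernel is kept; if it reported only 512, the loop would halve block_size to 16 and try matrixMul_bs16_64bit on the next iteration (16 * 16 = 256 <= 512), and so on down to the 8-wide variant.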


@ -42,86 +42,87 @@
//! wA is A's width and wB is B's width
////////////////////////////////////////////////////////////////////////////////
template <int block_size, typename size_type>
__device__ void matrixMul(float *C, float *A, float *B, size_type wA, size_type wB)
{
    // Block index
    size_type bx = blockIdx.x;
    size_type by = blockIdx.y;

    // Thread index
    size_type tx = threadIdx.x;
    size_type ty = threadIdx.y;

    // Index of the first sub-matrix of A processed by the block
    size_type aBegin = wA * block_size * by;

    // Index of the last sub-matrix of A processed by the block
    size_type aEnd = aBegin + wA - 1;

    // Step size used to iterate through the sub-matrices of A
    size_type aStep = block_size;

    // Index of the first sub-matrix of B processed by the block
    size_type bBegin = block_size * bx;

    // Step size used to iterate through the sub-matrices of B
    size_type bStep = block_size * wB;

    // Csub is used to store the element of the block sub-matrix
    // that is computed by the thread
    float Csub = 0;

    // Loop over all the sub-matrices of A and B
    // required to compute the block sub-matrix
    for (size_type a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
        // Declaration of the shared memory array As used to
        // store the sub-matrix of A
        __shared__ float As[block_size][block_size];

        // Declaration of the shared memory array Bs used to
        // store the sub-matrix of B
        __shared__ float Bs[block_size][block_size];

        // Load the matrices from device memory
        // to shared memory; each thread loads
        // one element of each matrix
        AS(ty, tx) = A[a + wA * ty + tx];
        BS(ty, tx) = B[b + wB * ty + tx];

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Multiply the two matrices together;
        // each thread computes one element
        // of the block sub-matrix
#pragma unroll
        for (size_type k = 0; k < block_size; ++k)
            Csub += AS(ty, k) * BS(k, tx);

        // Synchronize to make sure that the preceding
        // computation is done before loading two new
        // sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory;
    // each thread writes one element
    size_type c = wB * block_size * by + block_size * bx;
    C[c + wB * ty + tx] = Csub;
}

// C wrappers around our template kernel
extern "C" __global__ void matrixMul_bs8_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<8, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs16_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<16, size_t>(C, A, B, wA, wB);
}
extern "C" __global__ void matrixMul_bs32_64bit(float *C, float *A, float *B, size_t wA, size_t wB)
{
    matrixMul<32, size_t>(C, A, B, wA, wB);
}
#endif // #ifndef _MATRIXMUL_KERNEL_H_
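The AS()/BS() accessors used above are defined earlier in this header, outside this hunk; presumably they are plain 2-D indexing macros over the shared-memory tiles, along the lines of:

// Assumed definitions (not shown in this diff): straight indexing into the
// shared-memory tiles declared inside the loop.
#define AS(i, j) As[i][j]
#define BS(i, j) Bs[i][j]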


@ -15,210 +15,211 @@
// With these flags defined, this source file will dynamically
// load the corresponding functions. Disabled by default.
// #define CUDA_INIT_D3D9
// #define CUDA_INIT_D3D10
// #define CUDA_INIT_D3D11
// #define CUDA_INIT_OPENGL

#include "cuda_drvapi_dynlink.h"

#include <stdio.h>

tcuInit *_cuInit;
tcuDriverGetVersion *cuDriverGetVersion;
tcuDeviceGet *cuDeviceGet;
tcuDeviceGetCount *cuDeviceGetCount;
tcuDeviceGetName *cuDeviceGetName;
tcuDeviceComputeCapability *cuDeviceComputeCapability;
tcuDeviceTotalMem *cuDeviceTotalMem;
tcuDeviceGetProperties *cuDeviceGetProperties;
tcuDeviceGetAttribute *cuDeviceGetAttribute;
tcuGetErrorString *cuGetErrorString;
tcuCtxCreate *cuCtxCreate;
tcuCtxDestroy *cuCtxDestroy;
tcuCtxAttach *cuCtxAttach;
tcuCtxDetach *cuCtxDetach;
tcuCtxPushCurrent *cuCtxPushCurrent;
tcuCtxPopCurrent *cuCtxPopCurrent;
tcuCtxGetCurrent *cuCtxGetCurrent;
tcuCtxSetCurrent *cuCtxSetCurrent;
tcuCtxGetDevice *cuCtxGetDevice;
tcuCtxSynchronize *cuCtxSynchronize;
tcuModuleLoad *cuModuleLoad;
tcuModuleLoadData *cuModuleLoadData;
tcuModuleLoadDataEx *cuModuleLoadDataEx;
tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
tcuModuleUnload *cuModuleUnload;
tcuModuleGetFunction *cuModuleGetFunction;
tcuModuleGetGlobal *cuModuleGetGlobal;
tcuModuleGetTexRef *cuModuleGetTexRef;
tcuModuleGetSurfRef *cuModuleGetSurfRef;
tcuMemGetInfo *cuMemGetInfo;
tcuMemAlloc *cuMemAlloc;
tcuMemAllocPitch *cuMemAllocPitch;
tcuMemFree *cuMemFree;
tcuMemGetAddressRange *cuMemGetAddressRange;
tcuMemAllocHost *cuMemAllocHost;
tcuMemFreeHost *cuMemFreeHost;
tcuMemHostAlloc *cuMemHostAlloc;
tcuMemHostGetFlags *cuMemHostGetFlags;
tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
tcuDeviceGetByPCIBusId *cuDeviceGetByPCIBusId;
tcuDeviceGetPCIBusId *cuDeviceGetPCIBusId;
tcuIpcGetEventHandle *cuIpcGetEventHandle;
tcuIpcOpenEventHandle *cuIpcOpenEventHandle;
tcuIpcGetMemHandle *cuIpcGetMemHandle;
tcuIpcOpenMemHandle *cuIpcOpenMemHandle;
tcuIpcCloseMemHandle *cuIpcCloseMemHandle;
tcuMemHostRegister *cuMemHostRegister;
tcuMemHostUnregister *cuMemHostUnregister;
tcuMemcpyHtoD *cuMemcpyHtoD;
tcuMemcpyDtoH *cuMemcpyDtoH;
tcuMemcpyDtoD *cuMemcpyDtoD;
tcuMemcpyDtoA *cuMemcpyDtoA;
tcuMemcpyAtoD *cuMemcpyAtoD;
tcuMemcpyHtoA *cuMemcpyHtoA;
tcuMemcpyAtoH *cuMemcpyAtoH;
tcuMemcpyAtoA *cuMemcpyAtoA;
tcuMemcpy2D *cuMemcpy2D;
tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
tcuMemcpy3D *cuMemcpy3D;
tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
tcuMemcpy2DAsync *cuMemcpy2DAsync;
tcuMemcpy3DAsync *cuMemcpy3DAsync;
tcuMemcpy *cuMemcpy;
tcuMemcpyPeer *cuMemcpyPeer;
tcuMemsetD8 *cuMemsetD8;
tcuMemsetD16 *cuMemsetD16;
tcuMemsetD32 *cuMemsetD32;
tcuMemsetD2D8 *cuMemsetD2D8;
tcuMemsetD2D16 *cuMemsetD2D16;
tcuMemsetD2D32 *cuMemsetD2D32;
tcuFuncSetBlockShape *cuFuncSetBlockShape;
tcuFuncSetSharedSize *cuFuncSetSharedSize;
tcuFuncGetAttribute *cuFuncGetAttribute;
tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
tcuFuncSetSharedMemConfig *cuFuncSetSharedMemConfig;
tcuLaunchKernel *cuLaunchKernel;
tcuArrayCreate *cuArrayCreate;
tcuArrayGetDescriptor *cuArrayGetDescriptor;
tcuArrayDestroy *cuArrayDestroy;
tcuArray3DCreate *cuArray3DCreate;
tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
tcuTexRefCreate *cuTexRefCreate;
tcuTexRefDestroy *cuTexRefDestroy;
tcuTexRefSetArray *cuTexRefSetArray;
tcuTexRefSetAddress *cuTexRefSetAddress;
tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
tcuTexRefSetFormat *cuTexRefSetFormat;
tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
tcuTexRefSetFlags *cuTexRefSetFlags;
tcuTexRefGetAddress *cuTexRefGetAddress;
tcuTexRefGetArray *cuTexRefGetArray;
tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
tcuTexRefGetFormat *cuTexRefGetFormat;
tcuTexRefGetFlags *cuTexRefGetFlags;
tcuSurfRefSetArray *cuSurfRefSetArray;
tcuSurfRefGetArray *cuSurfRefGetArray;
tcuParamSetSize *cuParamSetSize;
tcuParamSeti *cuParamSeti;
tcuParamSetf *cuParamSetf;
tcuParamSetv *cuParamSetv;
tcuParamSetTexRef *cuParamSetTexRef;
tcuLaunch *cuLaunch;
tcuLaunchGrid *cuLaunchGrid;
tcuLaunchGridAsync *cuLaunchGridAsync;
tcuEventCreate *cuEventCreate;
tcuEventRecord *cuEventRecord;
tcuEventQuery *cuEventQuery;
tcuEventSynchronize *cuEventSynchronize;
tcuEventDestroy *cuEventDestroy;
tcuEventElapsedTime *cuEventElapsedTime;
tcuStreamCreate *cuStreamCreate;
tcuStreamWaitEvent *cuStreamWaitEvent;
tcuStreamAddCallback *cuStreamAddCallback;
tcuStreamQuery *cuStreamQuery;
tcuStreamSynchronize *cuStreamSynchronize;
tcuStreamDestroy *cuStreamDestroy;
tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
tcuGraphicsMapResources *cuGraphicsMapResources;
tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
tcuGetExportTable *cuGetExportTable;
tcuCtxSetLimit *cuCtxSetLimit;
tcuCtxGetLimit *cuCtxGetLimit;
tcuCtxGetCacheConfig *cuCtxGetCacheConfig;
tcuCtxSetCacheConfig *cuCtxSetCacheConfig;
tcuCtxGetSharedMemConfig *cuCtxGetSharedMemConfig;
tcuCtxSetSharedMemConfig *cuCtxSetSharedMemConfig;
tcuCtxGetApiVersion *cuCtxGetApiVersion;

tcuMipmappedArrayCreate *cuMipmappedArrayCreate;
tcuMipmappedArrayGetLevel *cuMipmappedArrayGetLevel;
tcuMipmappedArrayDestroy *cuMipmappedArrayDestroy;

tcuProfilerStop *cuProfilerStop;
#ifdef CUDA_INIT_D3D9
// D3D9/CUDA interop (CUDA 1.x compatible API). These functions
// are deprecated; please use the ones below
tcuD3D9Begin *cuD3D9Begin;
tcuD3D9End *cuD3DEnd;
tcuD3D9RegisterVertexBuffer *cuD3D9RegisterVertexBuffer;
tcuD3D9MapVertexBuffer *cuD3D9MapVertexBuffer;
tcuD3D9UnmapVertexBuffer *cuD3D9UnmapVertexBuffer;
tcuD3D9UnregisterVertexBuffer *cuD3D9UnregisterVertexBuffer;

// D3D9/CUDA interop (CUDA 2.x compatible)
tcuD3D9GetDirect3DDevice *cuD3D9GetDirect3DDevice;
tcuD3D9RegisterResource *cuD3D9RegisterResource;
tcuD3D9UnregisterResource *cuD3D9UnregisterResource;
tcuD3D9MapResources *cuD3D9MapResources;
tcuD3D9UnmapResources *cuD3D9UnmapResources;
tcuD3D9ResourceSetMapFlags *cuD3D9ResourceSetMapFlags;
tcuD3D9ResourceGetSurfaceDimensions *cuD3D9ResourceGetSurfaceDimensions;
tcuD3D9ResourceGetMappedArray *cuD3D9ResourceGetMappedArray;
tcuD3D9ResourceGetMappedPointer *cuD3D9ResourceGetMappedPointer;
tcuD3D9ResourceGetMappedSize *cuD3D9ResourceGetMappedSize;
tcuD3D9ResourceGetMappedPitch *cuD3D9ResourceGetMappedPitch;

// D3D9/CUDA interop (CUDA 2.0+)
tcuD3D9GetDevice *cuD3D9GetDevice;
tcuD3D9CtxCreate *cuD3D9CtxCreate;
tcuGraphicsD3D9RegisterResource *cuGraphicsD3D9RegisterResource;
#endif

#ifdef CUDA_INIT_D3D10
// D3D10/CUDA interop (CUDA 3.0+)
tcuD3D10GetDevice *cuD3D10GetDevice;
tcuD3D10CtxCreate *cuD3D10CtxCreate;
tcuGraphicsD3D10RegisterResource *cuGraphicsD3D10RegisterResource;
#endif

#ifdef CUDA_INIT_D3D11
// D3D11/CUDA interop (CUDA 3.0+)
tcuD3D11GetDevice *cuD3D11GetDevice;
tcuD3D11CtxCreate *cuD3D11CtxCreate;
tcuGraphicsD3D11RegisterResource *cuGraphicsD3D11RegisterResource;
#endif

// GL/CUDA interop
#ifdef CUDA_INIT_OPENGL
tcuGLCtxCreate *cuGLCtxCreate;
tcuGraphicsGLRegisterBuffer *cuGraphicsGLRegisterBuffer;
tcuGraphicsGLRegisterImage *cuGraphicsGLRegisterImage;
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
tcuWGLGetDevice *cuWGLGetDevice;
#endif
#endif
@ -239,8 +240,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
{
    *pInstance = LoadLibrary(__CudaLibName);

    if (*pInstance == NULL) {
        printf("LoadLibrary \"%s\" failed!\n", __CudaLibName);
        return CUDA_ERROR_UNKNOWN;
    }
@ -248,38 +248,35 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
    return CUDA_SUCCESS;
}

#define GET_PROC_EX(name, alias, required)                                                 \
    alias = (t##name *)GetProcAddress(CudaDrvLib, #name);                                  \
    if (alias == NULL && required) {                                                       \
        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName);   \
        return CUDA_ERROR_UNKNOWN;                                                         \
    }

#define GET_PROC_EX_V2(name, alias, required)                                                            \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v2));                                 \
    if (alias == NULL && required) {                                                                     \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName);  \
        return CUDA_ERROR_UNKNOWN;                                                                       \
    }

#define GET_PROC_EX_V3(name, alias, required)                                                            \
    alias = (t##name *)GetProcAddress(CudaDrvLib, STRINGIFY(name##_v3));                                 \
    if (alias == NULL && required) {                                                                     \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName);  \
        return CUDA_ERROR_UNKNOWN;                                                                       \
    }
#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || defined(__MACOSX)
#include <dlfcn.h>

#if defined(__APPLE__) || defined(__MACOSX)
static char __CudaLibName[] = "/usr/local/cuda/lib/libcuda.dylib";
#elif defined(__ANDROID__)
#if defined(__aarch64__)
static char __CudaLibName[] = "/system/vendor/lib64/libcuda.so";
#elif defined(__arm__)
static char __CudaLibName[] = "/system/vendor/lib/libcuda.so";
@ -294,8 +291,7 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
{
    *pInstance = dlopen(__CudaLibName, RTLD_NOW);

    if (*pInstance == NULL) {
        printf("dlopen \"%s\" failed!\n", __CudaLibName);
        return CUDA_ERROR_UNKNOWN;
    }
@ -303,52 +299,49 @@ static CUresult LOAD_LIBRARY(CUDADRIVER *pInstance)
    return CUDA_SUCCESS;
}

#define GET_PROC_EX(name, alias, required)                                                 \
    alias = (t##name *)dlsym(CudaDrvLib, #name);                                           \
    if (alias == NULL && required) {                                                       \
        printf("Failed to find required function \"%s\" in %s\n", #name, __CudaLibName);   \
        return CUDA_ERROR_UNKNOWN;                                                         \
    }

#define GET_PROC_EX_V2(name, alias, required)                                                            \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v2));                                          \
    if (alias == NULL && required) {                                                                     \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v2), __CudaLibName);  \
        return CUDA_ERROR_UNKNOWN;                                                                       \
    }

#define GET_PROC_EX_V3(name, alias, required)                                                            \
    alias = (t##name *)dlsym(CudaDrvLib, STRINGIFY(name##_v3));                                          \
    if (alias == NULL && required) {                                                                     \
        printf("Failed to find required function \"%s\" in %s\n", STRINGIFY(name##_v3), __CudaLibName);  \
        return CUDA_ERROR_UNKNOWN;                                                                       \
    }

#else
#error unsupported platform
#endif
#define CHECKED_CALL(call) \ #define CHECKED_CALL(call) \
do { \ do { \
CUresult result = (call); \ CUresult result = (call); \
if (CUDA_SUCCESS != result) { \ if (CUDA_SUCCESS != result) { \
return result; \ return result; \
} \ } \
} while(0) } while (0)
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1) #define GET_PROC_REQUIRED(name) GET_PROC_EX(name, name, 1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0) #define GET_PROC_OPTIONAL(name) GET_PROC_EX(name, name, 0)
#define GET_PROC(name) GET_PROC_REQUIRED(name) #define GET_PROC(name) GET_PROC_REQUIRED(name)
#define GET_PROC_V2(name) GET_PROC_EX_V2(name,name,1) #define GET_PROC_V2(name) GET_PROC_EX_V2(name, name, 1)
#define GET_PROC_V3(name) GET_PROC_EX_V3(name,name,1) #define GET_PROC_V3(name) GET_PROC_EX_V3(name, name, 1)
CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion) CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
{ {
CUDADRIVER CudaDrvLib; CUDADRIVER CudaDrvLib;
int driverVer = 1000; int driverVer = 1000;
CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib)); CHECKED_CALL(LOAD_LIBRARY(&CudaDrvLib));
@ -359,8 +352,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
// available since 2.2. if not present, version 1.0 is assumed // available since 2.2. if not present, version 1.0 is assumed
GET_PROC_OPTIONAL(cuDriverGetVersion); GET_PROC_OPTIONAL(cuDriverGetVersion);
if (cuDriverGetVersion) if (cuDriverGetVersion) {
{
CHECKED_CALL(cuDriverGetVersion(&driverVer)); CHECKED_CALL(cuDriverGetVersion(&driverVer));
} }
@ -428,24 +420,21 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuStreamDestroy); GET_PROC(cuStreamDestroy);
// These are CUDA 5.0 new functions // These are CUDA 5.0 new functions
if (driverVer >= 5000) if (driverVer >= 5000) {
{
GET_PROC(cuMipmappedArrayCreate); GET_PROC(cuMipmappedArrayCreate);
GET_PROC(cuMipmappedArrayDestroy); GET_PROC(cuMipmappedArrayDestroy);
GET_PROC(cuMipmappedArrayGetLevel); GET_PROC(cuMipmappedArrayGetLevel);
} }
// These are CUDA 4.2 new functions // These are CUDA 4.2 new functions
if (driverVer >= 4020) if (driverVer >= 4020) {
{
GET_PROC(cuFuncSetSharedMemConfig); GET_PROC(cuFuncSetSharedMemConfig);
GET_PROC(cuCtxGetSharedMemConfig); GET_PROC(cuCtxGetSharedMemConfig);
GET_PROC(cuCtxSetSharedMemConfig); GET_PROC(cuCtxSetSharedMemConfig);
} }
// These are CUDA 4.1 new functions // These are CUDA 4.1 new functions
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
{
GET_PROC(cuDeviceGetByPCIBusId); GET_PROC(cuDeviceGetByPCIBusId);
GET_PROC(cuDeviceGetPCIBusId); GET_PROC(cuDeviceGetPCIBusId);
GET_PROC(cuIpcGetEventHandle); GET_PROC(cuIpcGetEventHandle);
@ -456,8 +445,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
} }
// These could be _v2 interfaces // These could be _v2 interfaces
if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) if (cudaVersion >= 4000 && __CUDA_API_VERSION >= 4000) {
{
GET_PROC_V2(cuCtxDestroy); GET_PROC_V2(cuCtxDestroy);
GET_PROC_V2(cuCtxPopCurrent); GET_PROC_V2(cuCtxPopCurrent);
GET_PROC_V2(cuCtxPushCurrent); GET_PROC_V2(cuCtxPushCurrent);
@ -465,8 +453,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC_V2(cuEventDestroy); GET_PROC_V2(cuEventDestroy);
} }
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
{
GET_PROC_V2(cuDeviceTotalMem); GET_PROC_V2(cuDeviceTotalMem);
GET_PROC_V2(cuCtxCreate); GET_PROC_V2(cuCtxCreate);
GET_PROC_V2(cuModuleGetGlobal); GET_PROC_V2(cuModuleGetGlobal);
@ -507,17 +494,14 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC_V2(cuTexRefSetAddress); GET_PROC_V2(cuTexRefSetAddress);
GET_PROC_V2(cuTexRefGetAddress); GET_PROC_V2(cuTexRefGetAddress);
if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) if (cudaVersion >= 4010 && __CUDA_API_VERSION >= 4010) {
{
GET_PROC_V3(cuTexRefSetAddress2D); GET_PROC_V3(cuTexRefSetAddress2D);
} }
else else {
{
GET_PROC_V2(cuTexRefSetAddress2D); GET_PROC_V2(cuTexRefSetAddress2D);
} }
} }
else else {
{
// versions earlier than 3020 // versions earlier than 3020
GET_PROC(cuDeviceTotalMem); GET_PROC(cuDeviceTotalMem);
GET_PROC(cuCtxCreate); GET_PROC(cuCtxCreate);
@ -562,8 +546,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
} }
// The following functions are specific to CUDA versions // The following functions are specific to CUDA versions
if (driverVer >= 4000) if (driverVer >= 4000) {
{
GET_PROC(cuCtxSetCurrent); GET_PROC(cuCtxSetCurrent);
GET_PROC(cuCtxGetCurrent); GET_PROC(cuCtxGetCurrent);
GET_PROC(cuMemHostRegister); GET_PROC(cuMemHostRegister);
@ -574,8 +557,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuProfilerStop); GET_PROC(cuProfilerStop);
} }
if (driverVer >= 3010) if (driverVer >= 3010) {
{
GET_PROC(cuModuleGetSurfRef); GET_PROC(cuModuleGetSurfRef);
GET_PROC(cuSurfRefSetArray); GET_PROC(cuSurfRefSetArray);
GET_PROC(cuSurfRefGetArray); GET_PROC(cuSurfRefGetArray);
@ -583,8 +565,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuCtxGetLimit); GET_PROC(cuCtxGetLimit);
} }
if (driverVer >= 3000) if (driverVer >= 3000) {
{
GET_PROC(cuMemcpyDtoDAsync); GET_PROC(cuMemcpyDtoDAsync);
GET_PROC(cuFuncSetCacheConfig); GET_PROC(cuFuncSetCacheConfig);
#ifdef CUDA_INIT_D3D11 #ifdef CUDA_INIT_D3D11
@ -595,12 +576,10 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuGraphicsUnregisterResource); GET_PROC(cuGraphicsUnregisterResource);
GET_PROC(cuGraphicsSubResourceGetMappedArray); GET_PROC(cuGraphicsSubResourceGetMappedArray);
if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) if (cudaVersion >= 3020 && __CUDA_API_VERSION >= 3020) {
{
GET_PROC_V2(cuGraphicsResourceGetMappedPointer); GET_PROC_V2(cuGraphicsResourceGetMappedPointer);
} }
else else {
{
GET_PROC(cuGraphicsResourceGetMappedPointer); GET_PROC(cuGraphicsResourceGetMappedPointer);
} }
@ -610,8 +589,7 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
GET_PROC(cuGetExportTable); GET_PROC(cuGetExportTable);
} }
if (driverVer >= 2030) if (driverVer >= 2030) {
{
GET_PROC(cuMemHostGetFlags); GET_PROC(cuMemHostGetFlags);
#ifdef CUDA_INIT_D3D10 #ifdef CUDA_INIT_D3D10
GET_PROC(cuD3D10GetDevice); GET_PROC(cuD3D10GetDevice);
@ -624,17 +602,16 @@ CUresult CUDAAPI cuInit(unsigned int Flags, int cudaVersion)
#endif #endif
} }
if (driverVer >= 2010) if (driverVer >= 2010) {
{
GET_PROC(cuModuleLoadDataEx); GET_PROC(cuModuleLoadDataEx);
GET_PROC(cuModuleLoadFatBinary); GET_PROC(cuModuleLoadFatBinary);
#ifdef CUDA_INIT_OPENGL #ifdef CUDA_INIT_OPENGL
GET_PROC(cuGLCtxCreate); GET_PROC(cuGLCtxCreate);
GET_PROC(cuGraphicsGLRegisterBuffer); GET_PROC(cuGraphicsGLRegisterBuffer);
GET_PROC(cuGraphicsGLRegisterImage); GET_PROC(cuGraphicsGLRegisterImage);
# ifdef WIN32 #ifdef WIN32
GET_PROC(cuWGLGetDevice); GET_PROC(cuWGLGetDevice);
# endif #endif
#endif #endif
#ifdef CUDA_INIT_D3D9 #ifdef CUDA_INIT_D3D9
GET_PROC(cuD3D9GetDevice); GET_PROC(cuD3D9GetDevice);
View File

@ -14,21 +14,17 @@
#ifndef HELPER_CUDA_DRVAPI_H #ifndef HELPER_CUDA_DRVAPI_H
#define HELPER_CUDA_DRVAPI_H #define HELPER_CUDA_DRVAPI_H
#include <helper_string.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <helper_string.h>
#ifndef MAX #ifndef MAX
#define MAX(a, b) (a > b ? a : b) #define MAX(a, b) (a > b ? a : b)
#endif #endif
#ifndef HELPER_CUDA_DRVAPI_H #ifndef HELPER_CUDA_DRVAPI_H
inline int ftoi(float value) { inline int ftoi(float value) { return (value >= 0 ? static_cast<int>(value + 0.5) : static_cast<int>(value - 0.5)); }
return (value >= 0 ? static_cast<int>(value + 0.5)
: static_cast<int>(value - 0.5));
}
#endif #endif
#ifndef EXIT_WAIVED #ifndef EXIT_WAIVED
@ -47,311 +43,302 @@ inline int ftoi(float value) {
#define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__) #define checkCudaErrors(err) __checkCudaErrors(err, __FILE__, __LINE__)
// These are the inline versions for all of the SDK helper functions // These are the inline versions for all of the SDK helper functions
inline void __checkCudaErrors(CUresult err, const char *file, const int line) { inline void __checkCudaErrors(CUresult err, const char *file, const int line)
if (CUDA_SUCCESS != err) { {
const char *errorStr = NULL; if (CUDA_SUCCESS != err) {
cuGetErrorString(err, &errorStr); const char *errorStr = NULL;
fprintf(stderr, cuGetErrorString(err, &errorStr);
"checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, " fprintf(stderr,
"line %i.\n", "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, "
err, errorStr, file, line); "line %i.\n",
exit(EXIT_FAILURE); err,
} errorStr,
file,
line);
exit(EXIT_FAILURE);
}
} }
#endif #endif
// This function wraps the CUDA Driver API into a template function // This function wraps the CUDA Driver API into a template function
template <class T> template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, {
int device) { checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
checkCudaErrors(cuDeviceGetAttribute(attribute, device_attribute, device));
} }
#endif #endif
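A minimal usage sketch of the two helpers above (illustrative; the attribute and device index are arbitrary): any Driver API return code can be routed through checkCudaErrors so a failure reports file and line, and getCudaAttribute wraps cuDeviceGetAttribute with the same checking.

    int smCount = 0;
    getCudaAttribute<int>(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0); // SM count of device 0
    checkCudaErrors(cuCtxSynchronize());                                          // exits with file/line on error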
// Beginning of GPU Architecture definitions // Beginning of GPU Architecture definitions
inline int _ConvertSMVer2CoresDRV(int major, int minor) { inline int _ConvertSMVer2CoresDRV(int major, int minor)
// Defines for GPU Architecture types (using the SM version to determine the # {
// of cores per SM // Defines for GPU Architecture types (using the SM version to determine the #
typedef struct { // of cores per SM
int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM typedef struct
// minor version {
int Cores; int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM
} sSMtoCores; // minor version
int Cores;
} sSMtoCores;
sSMtoCores nGpuArchCoresPerSM[] = { sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192},
{0x30, 192}, {0x32, 192},
{0x32, 192}, {0x35, 192},
{0x35, 192}, {0x37, 192},
{0x37, 192}, {0x50, 128},
{0x50, 128}, {0x52, 128},
{0x52, 128}, {0x53, 128},
{0x53, 128}, {0x60, 64},
{0x60, 64}, {0x61, 128},
{0x61, 128}, {0x62, 128},
{0x62, 128}, {0x70, 64},
{0x70, 64}, {0x72, 64},
{0x72, 64}, {0x75, 64},
{0x75, 64}, {0x80, 64},
{0x80, 64}, {0x86, 128},
{0x86, 128}, {0x87, 128},
{0x87, 128}, {0x90, 128},
{0x90, 128}, {-1, -1}};
{-1, -1}};
int index = 0; int index = 0;
while (nGpuArchCoresPerSM[index].SM != -1) { while (nGpuArchCoresPerSM[index].SM != -1) {
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
return nGpuArchCoresPerSM[index].Cores; return nGpuArchCoresPerSM[index].Cores;
}
index++;
} }
index++; // If we don't find the values, we default use the previous one to run
} // properly
printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
// If we don't find the values, we default use the previous one to run major,
// properly minor,
printf( nGpuArchCoresPerSM[index - 1].Cores);
"MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", return nGpuArchCoresPerSM[index - 1].Cores;
major, minor, nGpuArchCoresPerSM[index - 1].Cores);
return nGpuArchCoresPerSM[index - 1].Cores;
} }
// end of GPU Architecture definitions // end of GPU Architecture definitions
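As a quick illustration of the 0xMm encoding used by the table: a compute capability 8.6 device hashes to (8 << 4) + 6 = 0x86, which the table maps to 128 cores per SM, so

    int coresPerSM = _ConvertSMVer2CoresDRV(8, 6); // returns 128 per the table above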
#ifdef __cuda_cuda_h__ #ifdef __cuda_cuda_h__
// General GPU Device CUDA Initialization // General GPU Device CUDA Initialization
inline int gpuDeviceInitDRV(int ARGC, const char **ARGV) { inline int gpuDeviceInitDRV(int ARGC, const char **ARGV)
int cuDevice = 0; {
int deviceCount = 0; int cuDevice = 0;
checkCudaErrors(cuInit(0, __CUDA_API_VERSION)); int deviceCount = 0;
checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
checkCudaErrors(cuDeviceGetCount(&deviceCount)); checkCudaErrors(cuDeviceGetCount(&deviceCount));
if (deviceCount == 0) { if (deviceCount == 0) {
fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n"); fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
int dev = 0; int dev = 0;
dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device="); dev = getCmdLineArgumentInt(ARGC, (const char **)ARGV, "device=");
if (dev < 0) { if (dev < 0) {
dev = 0; dev = 0;
} }
if (dev > deviceCount - 1) { if (dev > deviceCount - 1) {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
deviceCount); fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
fprintf(stderr, fprintf(stderr, "\n");
">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", return -dev;
dev); }
fprintf(stderr, "\n");
return -dev;
}
checkCudaErrors(cuDeviceGet(&cuDevice, dev)); checkCudaErrors(cuDeviceGet(&cuDevice, dev));
char name[100]; char name[100];
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice)); checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
int computeMode; int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev); getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
if (computeMode == CU_COMPUTEMODE_PROHIBITED) { if (computeMode == CU_COMPUTEMODE_PROHIBITED) {
fprintf(stderr, fprintf(stderr,
"Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no " "Error: device is running in <CU_COMPUTEMODE_PROHIBITED>, no "
"threads can use this CUDA Device.\n"); "threads can use this CUDA Device.\n");
return -1; return -1;
} }
if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) { if (checkCmdLineFlag(ARGC, (const char **)ARGV, "quiet") == false) {
printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name); printf("gpuDeviceInitDRV() Using CUDA Device [%d]: %s\n", dev, name);
} }
return dev; return dev;
} }
// This function returns the best GPU based on performance // This function returns the best GPU based on performance
inline int gpuGetMaxGflopsDeviceIdDRV() { inline int gpuGetMaxGflopsDeviceIdDRV()
CUdevice current_device = 0; {
CUdevice max_perf_device = 0; CUdevice current_device = 0;
int device_count = 0; CUdevice max_perf_device = 0;
int sm_per_multiproc = 0; int device_count = 0;
unsigned long long max_compute_perf = 0; int sm_per_multiproc = 0;
int major = 0; unsigned long long max_compute_perf = 0;
int minor = 0; int major = 0;
int multiProcessorCount; int minor = 0;
int clockRate; int multiProcessorCount;
int devices_prohibited = 0; int clockRate;
int devices_prohibited = 0;
cuInit(0, __CUDA_API_VERSION); cuInit(0, __CUDA_API_VERSION);
checkCudaErrors(cuDeviceGetCount(&device_count)); checkCudaErrors(cuDeviceGetCount(&device_count));
if (device_count == 0) { if (device_count == 0) {
fprintf(stderr, fprintf(stderr, "gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n");
"gpuGetMaxGflopsDeviceIdDRV error: no devices supporting CUDA\n"); exit(EXIT_FAILURE);
exit(EXIT_FAILURE);
}
// Find the best CUDA capable GPU device
current_device = 0;
while (current_device < device_count) {
checkCudaErrors(cuDeviceGetAttribute(
&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
current_device));
checkCudaErrors(cuDeviceGetAttribute(
&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE,
current_device);
if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
if (major == 9999 && minor == 9999) {
sm_per_multiproc = 1;
} else {
sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
}
unsigned long long compute_perf =
(unsigned long long)(multiProcessorCount * sm_per_multiproc *
clockRate);
if (compute_perf > max_compute_perf) {
max_compute_perf = compute_perf;
max_perf_device = current_device;
}
} else {
devices_prohibited++;
} }
++current_device; // Find the best CUDA capable GPU device
} current_device = 0;
if (devices_prohibited == device_count) { while (current_device < device_count) {
fprintf(stderr, checkCudaErrors(
"gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode " cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device));
"prohibited.\n"); checkCudaErrors(cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device));
exit(EXIT_FAILURE); checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
} checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
return max_perf_device; int computeMode;
getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device);
if (computeMode != CU_COMPUTEMODE_PROHIBITED) {
if (major == 9999 && minor == 9999) {
sm_per_multiproc = 1;
}
else {
sm_per_multiproc = _ConvertSMVer2CoresDRV(major, minor);
}
unsigned long long compute_perf = (unsigned long long)(multiProcessorCount * sm_per_multiproc * clockRate);
if (compute_perf > max_compute_perf) {
max_compute_perf = compute_perf;
max_perf_device = current_device;
}
}
else {
devices_prohibited++;
}
++current_device;
}
if (devices_prohibited == device_count) {
fprintf(stderr,
"gpuGetMaxGflopsDeviceIdDRV error: all devices have compute mode "
"prohibited.\n");
exit(EXIT_FAILURE);
}
return max_perf_device;
} }
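To make the selection heuristic above concrete (numbers are hypothetical): clockRate comes back from CU_DEVICE_ATTRIBUTE_CLOCK_RATE in kHz, so a device with 46 multiprocessors at compute capability 8.6 (128 cores/SM) and a 1,700,000 kHz clock scores 46 * 128 * 1,700,000 ≈ 1.0e10, while one with 30 multiprocessors, 128 cores/SM and a 1,500,000 kHz clock scores ≈ 5.8e9; the first device would be reported as the max-GFLOPS device.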
// General initialization call to pick the best CUDA Device // General initialization call to pick the best CUDA Device
inline CUdevice findCudaDeviceDRV(int argc, const char **argv) { inline CUdevice findCudaDeviceDRV(int argc, const char **argv)
CUdevice cuDevice; {
int devID = 0; CUdevice cuDevice;
int devID = 0;
// If the command-line has a device number specified, use it // If the command-line has a device number specified, use it
if (checkCmdLineFlag(argc, (const char **)argv, "device")) { if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
devID = gpuDeviceInitDRV(argc, argv); devID = gpuDeviceInitDRV(argc, argv);
if (devID < 0) { if (devID < 0) {
printf("exiting...\n"); printf("exiting...\n");
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
}
}
else {
// Otherwise pick the device with highest Gflops/s
char name[100];
devID = gpuGetMaxGflopsDeviceIdDRV();
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
cuDeviceGetName(name, 100, cuDevice);
printf("> Using CUDA Device [%d]: %s\n", devID, name);
} }
} else {
// Otherwise pick the device with highest Gflops/s
char name[100];
devID = gpuGetMaxGflopsDeviceIdDRV();
checkCudaErrors(cuDeviceGet(&cuDevice, devID));
cuDeviceGetName(name, 100, cuDevice);
printf("> Using CUDA Device [%d]: %s\n", devID, name);
}
cuDeviceGet(&cuDevice, devID); cuDeviceGet(&cuDevice, devID);
return cuDevice; return cuDevice;
} }
inline CUdevice findIntegratedGPUDrv() { inline CUdevice findIntegratedGPUDrv()
CUdevice current_device = 0; {
int device_count = 0; CUdevice current_device = 0;
int devices_prohibited = 0; int device_count = 0;
int isIntegrated; int devices_prohibited = 0;
int isIntegrated;
cuInit(0, __CUDA_API_VERSION); cuInit(0, __CUDA_API_VERSION);
checkCudaErrors(cuDeviceGetCount(&device_count)); checkCudaErrors(cuDeviceGetCount(&device_count));
if (device_count == 0) { if (device_count == 0) {
fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
}
// Find the integrated GPU which is compute capable
while (current_device < device_count) {
int computeMode = -1;
checkCudaErrors(cuDeviceGetAttribute(
&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
checkCudaErrors(cuDeviceGetAttribute(
&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
// If GPU is integrated and is not running on Compute Mode prohibited use
// that
if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
int major = 0, minor = 0;
char deviceName[256];
checkCudaErrors(cuDeviceGetAttribute(
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
current_device));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
current_device));
checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
current_device, deviceName, major, minor);
return current_device;
} else {
devices_prohibited++;
} }
current_device++; // Find the integrated GPU which is compute capable
} while (current_device < device_count) {
int computeMode = -1;
checkCudaErrors(cuDeviceGetAttribute(&isIntegrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, current_device));
checkCudaErrors(cuDeviceGetAttribute(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, current_device));
if (devices_prohibited == device_count) { // If GPU is integrated and is not running on Compute Mode prohibited use
fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n"); // that
exit(EXIT_FAILURE); if (isIntegrated && (computeMode != CU_COMPUTEMODE_PROHIBITED)) {
} int major = 0, minor = 0;
char deviceName[256];
checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, current_device));
checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, current_device));
checkCudaErrors(cuDeviceGetName(deviceName, 256, current_device));
printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device, deviceName, major, minor);
return -1; return current_device;
}
else {
devices_prohibited++;
}
current_device++;
}
if (devices_prohibited == device_count) {
fprintf(stderr, "CUDA error: No Integrated CUDA capable GPU found.\n");
exit(EXIT_FAILURE);
}
return -1;
} }
// General check for CUDA GPU SM Capabilities // General check for CUDA GPU SM Capabilities
inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, inline bool checkCudaCapabilitiesDRV(int major_version, int minor_version, int devID)
int devID) { {
CUdevice cuDevice; CUdevice cuDevice;
char name[256]; char name[256];
int major = 0, minor = 0; int major = 0, minor = 0;
checkCudaErrors(cuDeviceGet(&cuDevice, devID)); checkCudaErrors(cuDeviceGet(&cuDevice, devID));
checkCudaErrors(cuDeviceGetName(name, 100, cuDevice)); checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)); checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
checkCudaErrors(cuDeviceGetAttribute(
&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
if ((major > major_version) || if ((major > major_version) || (major == major_version && minor >= minor_version)) {
(major == major_version && minor >= minor_version)) { printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, major, minor);
printf("> Device %d: <%16s >, Compute SM %d.%d detected\n", devID, name, return true;
major, minor); }
return true; else {
} else { printf("No GPU device was found that can support CUDA compute capability "
printf( "%d.%d.\n",
"No GPU device was found that can support CUDA compute capability " major_version,
"%d.%d.\n", minor_version);
major_version, minor_version); return false;
return false; }
}
} }
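For example, with a required capability of 3.5, a device reporting SM 5.0 passes the check above (major 5 > 3), a device reporting SM 3.5 passes (equal major, minor 5 >= 5), and a device reporting SM 3.2 fails (equal major, minor 2 < 5).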
#endif #endif
// end of CUDA Helper Functions // end of CUDA Helper Functions
#endif // HELPER_CUDA_DRVAPI_H
#endif // HELPER_CUDA_DRVAPI_H
View File
@ -34,8 +34,8 @@
#define WA (4 * block_size) // Matrix A width #define WA (4 * block_size) // Matrix A width
#define HA (6 * block_size) // Matrix A height #define HA (6 * block_size) // Matrix A height
#define WB (4 * block_size) // Matrix B width #define WB (4 * block_size) // Matrix B width
#define HB WA // Matrix B height #define HB WA // Matrix B height
#define WC WB // Matrix C width #define WC WB // Matrix C width
#define HC HA // Matrix C height #define HC HA // Matrix C height
#endif // _MATRIXMUL_H_ #endif // _MATRIXMUL_H_
View File
@ -43,10 +43,10 @@
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes, CUDA // includes, CUDA
#include "cuda_drvapi_dynlink.h" #include "cuda_drvapi_dynlink.h"
@ -60,7 +60,7 @@
extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int); extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
#if defined _MSC_VER #if defined _MSC_VER
#pragma warning (disable : 4312) #pragma warning(disable : 4312)
#endif #endif
@ -68,7 +68,7 @@ extern "C" void computeGold(float *, const float *, const float *, unsigned int,
// Globals // Globals
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
CUcontext g_cuContext; CUcontext g_cuContext;
bool noprompt = false; bool noprompt = false;
static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)"; static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
@ -78,8 +78,7 @@ static const char *sSDKsample = "matrixMulDynlinkJIT (CUDA dynamic linking)";
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void randomInit(float *data, size_t size) void randomInit(float *data, size_t size)
{ {
for (size_t i = 0; i < size; ++i) for (size_t i = 0; i < size; ++i) {
{
data[i] = rand() / (float)RAND_MAX; data[i] = rand() / (float)RAND_MAX;
} }
} }
@ -89,33 +88,29 @@ void randomInit(float *data, size_t size)
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size_out) CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size_out)
{ {
CUresult status; CUresult status;
CUdevice cuDevice; CUdevice cuDevice;
CUmodule cuModule; CUmodule cuModule;
CUfunction cuFunction; CUfunction cuFunction;
int major, minor, block_size, devID = 0; int major, minor, block_size, devID = 0;
char deviceName[256]; char deviceName[256];
// link to cuda driver dynamically // link to cuda driver dynamically
checkCudaErrors(cuInit(0, __CUDA_API_VERSION)); checkCudaErrors(cuInit(0, __CUDA_API_VERSION));
// This assumes that the user is attempting to specify a explicit device -device=n // This assumes that the user is attempting to specify a explicit device -device=n
if (argc > 1) if (argc > 1) {
{
bool bFound = false; bool bFound = false;
for (int param=0; param < argc; param++) for (int param = 0; param < argc; param++) {
{ if (!strncmp(argv[param], "-device", 7)) {
if (!strncmp(argv[param], "-device", 7)) int i = (int)strlen(argv[1]);
{
int i=(int)strlen(argv[1]);
while (argv[1][i] != '=') while (argv[1][i] != '=') {
{
i--; i--;
} }
devID = atoi(&argv[1][++i]); devID = atoi(&argv[1][++i]);
bFound = true; bFound = true;
} }
@ -128,16 +123,15 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
int deviceCount = 0; int deviceCount = 0;
checkCudaErrors(cuDeviceGetCount(&deviceCount)); checkCudaErrors(cuDeviceGetCount(&deviceCount));
if (deviceCount == 0) if (deviceCount == 0) {
{
fprintf(stderr, "No devices supporting CUDA detected, exiting...\n"); fprintf(stderr, "No devices supporting CUDA detected, exiting...\n");
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
if (devID < 0) devID = 0; if (devID < 0)
devID = 0;
if (devID > deviceCount -1) if (devID > deviceCount - 1) {
{
fprintf(stderr, "initCUDA (Device=%d) invalid GPU device. %d GPU device(s) detected.\n\n", devID, deviceCount); fprintf(stderr, "initCUDA (Device=%d) invalid GPU device. %d GPU device(s) detected.\n\n", devID, deviceCount);
status = CUDA_ERROR_NOT_FOUND; status = CUDA_ERROR_NOT_FOUND;
@ -153,14 +147,13 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice)); checkCudaErrors(cuDeviceGetName(deviceName, 256, cuDevice));
printf("> Device %d: \"%s\" with Compute %d.%d capability\n", cuDevice, deviceName, major, minor); printf("> Device %d: \"%s\" with Compute %d.%d capability\n", cuDevice, deviceName, major, minor);
block_size = 32; block_size = 32;
*block_size_out = block_size; *block_size_out = block_size;
// create context for picked device // create context for picked device
status = cuCtxCreate(&g_cuContext, 0, cuDevice); status = cuCtxCreate(&g_cuContext, 0, cuDevice);
if (CUDA_SUCCESS != status) if (CUDA_SUCCESS != status) {
{
cuCtxDestroy(g_cuContext); cuCtxDestroy(g_cuContext);
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
@ -169,53 +162,53 @@ CUresult initCUDA(int argc, char **argv, CUfunction *pMatrixMul, int *block_size
{ {
// in this branch we use compilation with parameters // in this branch we use compilation with parameters
const unsigned int jitNumOptions = 3; const unsigned int jitNumOptions = 3;
CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
void **jitOptVals = new void *[jitNumOptions]; void **jitOptVals = new void *[jitNumOptions];
// set up size of compilation log buffer // set up size of compilation log buffer
jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
int jitLogBufferSize = 1024; int jitLogBufferSize = 1024;
jitOptVals[0] = (void *)(size_t)jitLogBufferSize; jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
// set up pointer to the compilation log buffer // set up pointer to the compilation log buffer
jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
char *jitLogBuffer = new char[jitLogBufferSize]; char *jitLogBuffer = new char[jitLogBufferSize];
jitOptVals[1] = jitLogBuffer; jitOptVals[1] = jitLogBuffer;
// set up pointer to set the Maximum # of registers for a particular kernel // set up pointer to set the Maximum # of registers for a particular kernel
jitOptions[2] = CU_JIT_MAX_REGISTERS; jitOptions[2] = CU_JIT_MAX_REGISTERS;
int jitRegCount = 32; int jitRegCount = 32;
jitOptVals[2] = (void *)(size_t)jitRegCount; jitOptVals[2] = (void *)(size_t)jitRegCount;
// compile with set parameters // compile with set parameters
printf("> Compiling CUDA module\n"); printf("> Compiling CUDA module\n");
#if defined(_WIN64) || defined(__LP64__) #if defined(_WIN64) || defined(__LP64__)
status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); status =
cuModuleLoadDataEx(&cuModule, matrixMul_kernel_64_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
#else #else
status = cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals); status =
cuModuleLoadDataEx(&cuModule, matrixMul_kernel_32_ptxdump, jitNumOptions, jitOptions, (void **)jitOptVals);
#endif #endif
printf("> PTX JIT log:\n%s\n", jitLogBuffer); printf("> PTX JIT log:\n%s\n", jitLogBuffer);
delete [] jitOptions; delete[] jitOptions;
delete [] jitOptVals; delete[] jitOptVals;
delete [] jitLogBuffer; delete[] jitLogBuffer;
} }
if (CUDA_SUCCESS != status) if (CUDA_SUCCESS != status) {
{
printf("Error while compiling PTX\n"); printf("Error while compiling PTX\n");
cuCtxDestroy(g_cuContext); cuCtxDestroy(g_cuContext);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
// retrieve CUDA function from the compiled module // retrieve CUDA function from the compiled module
status = cuModuleGetFunction(&cuFunction, cuModule, status = cuModuleGetFunction(
(block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit"); &cuFunction, cuModule, (block_size == 16) ? "matrixMul_bs16_32bit" : "matrixMul_bs32_32bit");
if (CUDA_SUCCESS != status) if (CUDA_SUCCESS != status) {
{
cuCtxDestroy(g_cuContext); cuCtxDestroy(g_cuContext);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
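Condensing the PTX JIT setup from this hunk into one place (illustrative sketch only, error handling omitted; the log-buffer size and register cap are the same values used above, and the 64-bit PTX image is picked as in the _WIN64/__LP64__ branch):

    CUjit_option opts[3]    = {CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER, CU_JIT_MAX_REGISTERS};
    char         logBuf[1024];
    void        *optVals[3] = {(void *)(size_t)sizeof(logBuf), (void *)logBuf, (void *)(size_t)32};
    CUmodule     mod;
    cuModuleLoadDataEx(&mod, matrixMul_kernel_64_ptxdump, 3, opts, optVals);
    printf("> PTX JIT log:\n%s\n", logBuf);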
@ -233,21 +226,21 @@ int main(int argc, char **argv)
printf("[ %s ]\n", sSDKsample); printf("[ %s ]\n", sSDKsample);
// initialize CUDA // initialize CUDA
CUfunction matrixMul = NULL; CUfunction matrixMul = NULL;
int block_size = 0; int block_size = 0;
checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size)); checkCudaErrors(initCUDA(argc, argv, &matrixMul, &block_size));
// set seed for rand() // set seed for rand()
srand(2006); srand(2006);
// allocate host memory for matrices A and B // allocate host memory for matrices A and B
size_t size_A = WA * HA; size_t size_A = WA * HA;
size_t mem_size_A = sizeof(float) * size_A; size_t mem_size_A = sizeof(float) * size_A;
size_t size_B = WB * HB; size_t size_B = WB * HB;
size_t mem_size_B = sizeof(float) * size_B; size_t mem_size_B = sizeof(float) * size_B;
float *h_A = (float *) malloc(mem_size_A); float *h_A = (float *)malloc(mem_size_A);
float *h_B = (float *) malloc(mem_size_B); float *h_B = (float *)malloc(mem_size_B);
// initialize host memory // initialize host memory
randomInit(h_A, size_A); randomInit(h_A, size_A);
@ -264,26 +257,24 @@ int main(int argc, char **argv)
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B)); checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// allocate device memory for result // allocate device memory for result
size_t size_C = WC * HC; size_t size_C = WC * HC;
size_t mem_size_C = sizeof(float) * size_C; size_t mem_size_C = sizeof(float) * size_C;
CUdeviceptr d_C; CUdeviceptr d_C;
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C)); checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
// allocate mem for the result on host side // allocate mem for the result on host side
float *h_C = (float *) malloc(mem_size_C); float *h_C = (float *)malloc(mem_size_C);
#if __CUDA_API_VERSION >= 4000 #if __CUDA_API_VERSION >= 4000
{ {
// This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method) // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel Launching (simpler method)
int Matrix_Width_A = WA; int Matrix_Width_A = WA;
int Matrix_Width_B = WB; int Matrix_Width_B = WB;
void *args[5] = { &d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B }; void *args[5] = {&d_C, &d_A, &d_B, &Matrix_Width_A, &Matrix_Width_B};
checkCudaErrors(cuLaunchKernel(matrixMul, (WC/block_size), (HC/block_size), 1, checkCudaErrors(cuLaunchKernel(
block_size , block_size , 1, matrixMul, (WC / block_size), (HC / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
0,
NULL, args, NULL));
} }
#else // __CUDA_API_VERSION <= 3020 #else // __CUDA_API_VERSION <= 3020
{ {
@ -312,7 +303,7 @@ int main(int argc, char **argv)
checkCudaErrors(cuParamSetSize(matrixMul, offset)); checkCudaErrors(cuParamSetSize(matrixMul, offset));
checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1)); checkCudaErrors(cuFuncSetBlockShape(matrixMul, block_size, block_size, 1));
checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2*block_size*block_size*sizeof(float))); checkCudaErrors(cuFuncSetSharedSize(matrixMul, 2 * block_size * block_size * sizeof(float)));
// set execution configuration for the CUDA kernel // set execution configuration for the CUDA kernel
checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size)); checkCudaErrors(cuLaunchGrid(matrixMul, WC / block_size, HC / block_size));
@ -322,19 +313,18 @@ int main(int argc, char **argv)
checkCudaErrors(cuCtxSynchronize()); checkCudaErrors(cuCtxSynchronize());
// copy result from device to host // copy result from device to host
checkCudaErrors(cuMemcpyDtoH((void *) h_C, d_C, mem_size_C)); checkCudaErrors(cuMemcpyDtoH((void *)h_C, d_C, mem_size_C));
// compute reference solution // compute reference solution
float *reference = (float *) malloc(mem_size_C); float *reference = (float *)malloc(mem_size_C);
computeGold(reference, h_A, h_B, HA, WA, WB); computeGold(reference, h_A, h_B, HA, WA, WB);
// check result // check result
float diff=0.0f; float diff = 0.0f;
for (unsigned int i=0; i<size_C; i++) for (unsigned int i = 0; i < size_C; i++) {
{
float tmp = reference[i] - h_C[i]; float tmp = reference[i] - h_C[i];
diff += tmp*tmp; diff += tmp * tmp;
} }
int res = (diff / (float)size_C < 1e-6f); int res = (diff / (float)size_C < 1e-6f);
@ -349,7 +339,7 @@ int main(int argc, char **argv)
checkCudaErrors(cuMemFree(d_C)); checkCudaErrors(cuMemFree(d_C));
checkCudaErrors(cuCtxDestroy(g_cuContext)); checkCudaErrors(cuCtxDestroy(g_cuContext));
printf("Test run %s\n", (1==res) ? "success!" : "failed!"); printf("Test run %s\n", (1 == res) ? "success!" : "failed!");
exit((1 == res) ? EXIT_SUCCESS : EXIT_FAILURE); exit((1 == res) ? EXIT_SUCCESS : EXIT_FAILURE);
} }
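The pass/fail decision above is a mean-squared-error test: diff accumulates the squared difference between the reference and GPU results, and the run passes when diff / size_C < 1e-6. As a hypothetical example, if every one of the size_C elements were off by 1e-4, the mean squared error would be 1e-8, comfortably below the threshold.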
View File
@ -28,8 +28,7 @@
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// export C interface // export C interface
extern "C" extern "C" void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
void computeGold(float *, const float *, const float *, unsigned int, unsigned int, unsigned int);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set //! Compute reference data set
@ -40,16 +39,13 @@ void computeGold(float *, const float *, const float *, unsigned int, unsigned i
//! @param hA height of matrix A //! @param hA height of matrix A
//! @param wB width of matrix B //! @param wB width of matrix B
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void void computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
computeGold(float *C, const float *A, const float *B, unsigned int hA, unsigned int wA, unsigned int wB)
{ {
for (unsigned int i = 0; i < hA; ++i) for (unsigned int i = 0; i < hA; ++i)
for (unsigned int j = 0; j < wB; ++j) for (unsigned int j = 0; j < wB; ++j) {
{
double sum = 0; double sum = 0;
for (unsigned int k = 0; k < wA; ++k) for (unsigned int k = 0; k < wA; ++k) {
{
double a = A[i * wA + k]; double a = A[i * wA + k];
double b = B[k * wB + j]; double b = B[k * wB + j];
sum += a * b; sum += a * b;
View File
@ -32,7 +32,8 @@
#define __matrixMul_kernel_32_ptxdump_h__ #define __matrixMul_kernel_32_ptxdump_h__
#if defined __cplusplus #if defined __cplusplus
extern "C" { extern "C"
{
#endif #endif
extern unsigned char matrixMul_kernel_32_ptxdump[25784]; extern unsigned char matrixMul_kernel_32_ptxdump[25784];
View File
@ -32,7 +32,8 @@
#define __matrixMul_kernel_64_ptxdump_h__ #define __matrixMul_kernel_64_ptxdump_h__
#if defined __cplusplus #if defined __cplusplus
extern "C" { extern "C"
{
#endif #endif
extern unsigned char matrixMul_kernel_64_ptxdump[26489]; extern unsigned char matrixMul_kernel_64_ptxdump[26489];
View File
@ -42,207 +42,208 @@
*/ */
// System includes // System includes
#include <stdio.h>
#include <assert.h> #include <assert.h>
#include <stdio.h>
// CUDA runtime // CUDA runtime
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include "nvrtc_helper.h" #include "nvrtc_helper.h"
// Helper functions and utilities to work with CUDA // Helper functions and utilities to work with CUDA
#include <helper_functions.h> #include <helper_functions.h>
void constantInit(float *data, int size, float val) { void constantInit(float *data, int size, float val)
for (int i = 0; i < size; ++i) { {
data[i] = val; for (int i = 0; i < size; ++i) {
} data[i] = val;
}
} }
/** /**
* Run a simple test of matrix multiplication using CUDA * Run a simple test of matrix multiplication using CUDA
*/ */
int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, int matrixMultiply(int argc, char **argv, int block_size, dim3 &dimsA, dim3 &dimsB)
dim3 &dimsB) { {
// Allocate host memory for matrices A and B // Allocate host memory for matrices A and B
unsigned int size_A = dimsA.x * dimsA.y; unsigned int size_A = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A; unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A); float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = dimsB.x * dimsB.y; unsigned int size_B = dimsB.x * dimsB.y;
unsigned int mem_size_B = sizeof(float) * size_B; unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B); float *h_B = (float *)malloc(mem_size_B);
// Initialize host memory // Initialize host memory
const float valB = 0.01f; const float valB = 0.01f;
constantInit(h_A, size_A, 1.0f); constantInit(h_A, size_A, 1.0f);
constantInit(h_B, size_B, valB); constantInit(h_B, size_B, valB);
// Allocate device memory // Allocate device memory
CUdeviceptr d_A, d_B, d_C; CUdeviceptr d_A, d_B, d_C;
char *cubin, *kernel_file; char *cubin, *kernel_file;
size_t cubinSize; size_t cubinSize;
kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]); kernel_file = sdkFindFilePath("matrixMul_kernel.cu", argv[0]);
compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1); compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 1);
CUmodule module = loadCUBIN(cubin, argc, argv); CUmodule module = loadCUBIN(cubin, argc, argv);
// Allocate host matrix C // Allocate host matrix C
dim3 dimsC(dimsB.x, dimsA.y, 1); dim3 dimsC(dimsB.x, dimsA.y, 1);
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float); unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
float *h_C = (float *)malloc(mem_size_C); float *h_C = (float *)malloc(mem_size_C);
if (h_C == NULL) { if (h_C == NULL) {
fprintf(stderr, "Failed to allocate host matrix C!\n"); fprintf(stderr, "Failed to allocate host matrix C!\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
}
checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
// copy host memory to device
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// Setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
// Create and start timer
printf("Computing result using CUDA Kernel...\n");
CUfunction kernel_addr;
if (block_size == 16) {
checkCudaErrors(
cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
} else {
checkCudaErrors(
cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
}
void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x,
(void *)&dimsB.x};
// Execute the kernel
int nIter = 300;
for (int j = 0; j < nIter; j++) {
checkCudaErrors(
cuLaunchKernel(kernel_addr, grid.x, grid.y, grid.z, /* grid dim */
threads.x, threads.y, threads.z, /* block dim */
0, 0, /* shared mem, stream */
&arr[0], /* arguments */
0));
checkCudaErrors(cuCtxSynchronize());
}
// Copy result from device to host
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));
printf("Checking computed result for correctness: ");
bool correct = true;
// test relative error by the formula
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
double eps = 1.e-6; // machine zero
for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
double abs_err = fabs(h_C[i] - (dimsA.x * valB));
double dot_length = dimsA.x;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i,
h_C[i], dimsA.x * valB, eps);
correct = false;
} }
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL"); checkCudaErrors(cuMemAlloc(&d_A, mem_size_A));
checkCudaErrors(cuMemAlloc(&d_B, mem_size_B));
checkCudaErrors(cuMemAlloc(&d_C, mem_size_C));
printf( // copy host memory to device
"\nNOTE: The CUDA Samples are not meant for performance measurements. " checkCudaErrors(cuMemcpyHtoD(d_A, h_A, mem_size_A));
"Results may vary when GPU Boost is enabled.\n"); checkCudaErrors(cuMemcpyHtoD(d_B, h_B, mem_size_B));
// Clean up memory // Setup execution parameters
free(h_A); dim3 threads(block_size, block_size);
free(h_B); dim3 grid(dimsB.x / threads.x, dimsA.y / threads.y);
free(h_C);
checkCudaErrors(cuMemFree(d_A)); // Create and start timer
checkCudaErrors(cuMemFree(d_B)); printf("Computing result using CUDA Kernel...\n");
checkCudaErrors(cuMemFree(d_C));
if (correct) { CUfunction kernel_addr;
return EXIT_SUCCESS; if (block_size == 16) {
} else { checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block16"));
return EXIT_FAILURE; }
} else {
checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "matrixMulCUDA_block32"));
}
void *arr[] = {(void *)&d_C, (void *)&d_A, (void *)&d_B, (void *)&dimsA.x, (void *)&dimsB.x};
// Execute the kernel
int nIter = 300;
for (int j = 0; j < nIter; j++) {
checkCudaErrors(cuLaunchKernel(kernel_addr,
grid.x,
grid.y,
grid.z, /* grid dim */
threads.x,
threads.y,
threads.z, /* block dim */
0,
0, /* shared mem, stream */
&arr[0], /* arguments */
0));
checkCudaErrors(cuCtxSynchronize());
}
// Copy result from device to host
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, mem_size_C));
printf("Checking computed result for correctness: ");
bool correct = true;
// test relative error by the formula
// |<x, y>_cpu - <x,y>_gpu|/<|x|, |y|> < eps
double eps = 1.e-6; // machine zero
for (int i = 0; i < (int)(dimsC.x * dimsC.y); i++) {
double abs_err = fabs(h_C[i] - (dimsA.x * valB));
double dot_length = dimsA.x;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n", i, h_C[i], dimsA.x * valB, eps);
correct = false;
}
}
printf("%s\n", correct ? "Result = PASS" : "Result = FAIL");
printf("\nNOTE: The CUDA Samples are not meant for performance measurements. "
"Results may vary when GPU Boost is enabled.\n");
// Clean up memory
free(h_A);
free(h_B);
free(h_C);
checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
if (correct) {
return EXIT_SUCCESS;
}
else {
return EXIT_FAILURE;
}
} }
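To put numbers on the relative-error test above (illustrative): with the default sizes, every element of C should equal dimsA.x * valB = 320 * 0.01 = 3.2. If the GPU instead produced 3.2001, then abs_err = 1e-4 and rel_err = 1e-4 / 3.2 / 320 ≈ 9.8e-8, which is below eps = 1e-6, so that element passes.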
/** /**
* Program main * Program main
*/ */
int main(int argc, char **argv) { int main(int argc, char **argv)
printf("[Matrix Multiply Using CUDA] - Starting...\n"); {
printf("[Matrix Multiply Using CUDA] - Starting...\n");
if (checkCmdLineFlag(argc, (const char **)argv, "help") || if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "?")) {
checkCmdLineFlag(argc, (const char **)argv, "?")) { printf("Usage -device=n (n >= 0 for deviceID)\n");
printf("Usage -device=n (n >= 0 for deviceID)\n"); printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n");
printf(" -wA=WidthA -hA=HeightA (Width x Height of Matrix A)\n"); printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n");
printf(" -wB=WidthB -hB=HeightB (Width x Height of Matrix B)\n"); printf(" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
printf(
" Note: Outer matrix dimensions of A & B matrices must be equal.\n");
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
int block_size = 32; int block_size = 32;
// original: // original:
dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1); dim3 dimsA(5 * 2 * block_size, 5 * 2 * block_size, 1);
dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1); dim3 dimsB(5 * 4 * block_size, 5 * 2 * block_size, 1);
// reduce sizes to avoid running out of memory // reduce sizes to avoid running out of memory
// dim3 dimsA(32,32, 1); // dim3 dimsA(32,32, 1);
// dim3 dimsB(32,32,1); // dim3 dimsB(32,32,1);
// width of Matrix A // width of Matrix A
if (checkCmdLineFlag(argc, (const char **)argv, "wA")) { if (checkCmdLineFlag(argc, (const char **)argv, "wA")) {
dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA"); dimsA.x = getCmdLineArgumentInt(argc, (const char **)argv, "wA");
} }
// height of Matrix A // height of Matrix A
if (checkCmdLineFlag(argc, (const char **)argv, "hA")) { if (checkCmdLineFlag(argc, (const char **)argv, "hA")) {
dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA"); dimsA.y = getCmdLineArgumentInt(argc, (const char **)argv, "hA");
} }
// width of Matrix B // width of Matrix B
if (checkCmdLineFlag(argc, (const char **)argv, "wB")) { if (checkCmdLineFlag(argc, (const char **)argv, "wB")) {
dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB"); dimsB.x = getCmdLineArgumentInt(argc, (const char **)argv, "wB");
} }
// height of Matrix B // height of Matrix B
if (checkCmdLineFlag(argc, (const char **)argv, "hB")) { if (checkCmdLineFlag(argc, (const char **)argv, "hB")) {
dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB"); dimsB.y = getCmdLineArgumentInt(argc, (const char **)argv, "hB");
} }
if (dimsA.x != dimsB.y) { if (dimsA.x != dimsB.y) {
printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", printf("Error: outer matrix dimensions must be equal. (%d != %d)\n", dimsA.x, dimsB.y);
dimsA.x, dimsB.y); exit(EXIT_FAILURE);
exit(EXIT_FAILURE); }
}
printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, printf("MatrixA(%d,%d), MatrixB(%d,%d)\n", dimsA.x, dimsA.y, dimsB.x, dimsB.y);
dimsB.y);
int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB); int matrix_result = matrixMultiply(argc, argv, block_size, dimsA, dimsB);
exit(matrix_result); exit(matrix_result);
} }
View File
@ -48,84 +48,83 @@
#include <cooperative_groups.h> #include <cooperative_groups.h>
template <int BLOCK_SIZE> template <int BLOCK_SIZE> __device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB)
__device__ void matrixMulCUDA(float *C, float *A, float *B, int wA, int wB) { {
// Handle to thread block group // Handle to thread block group
cooperative_groups::thread_block cta = cooperative_groups::thread_block cta = cooperative_groups::this_thread_block();
cooperative_groups::this_thread_block(); // Block index
// Block index int bx = blockIdx.x;
int bx = blockIdx.x; int by = blockIdx.y;
int by = blockIdx.y;
// Thread index // Thread index
int tx = threadIdx.x; int tx = threadIdx.x;
int ty = threadIdx.y; int ty = threadIdx.y;
// Index of the first sub-matrix of A processed by the block // Index of the first sub-matrix of A processed by the block
int aBegin = wA * BLOCK_SIZE * by; int aBegin = wA * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block // Index of the last sub-matrix of A processed by the block
int aEnd = aBegin + wA - 1; int aEnd = aBegin + wA - 1;
// Step size used to iterate through the sub-matrices of A // Step size used to iterate through the sub-matrices of A
int aStep = BLOCK_SIZE; int aStep = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block // Index of the first sub-matrix of B processed by the block
int bBegin = BLOCK_SIZE * bx; int bBegin = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B // Step size used to iterate through the sub-matrices of B
int bStep = BLOCK_SIZE * wB; int bStep = BLOCK_SIZE * wB;
// Csub is used to store the element of the block sub-matrix // Csub is used to store the element of the block sub-matrix
// that is computed by the thread // that is computed by the thread
float Csub = 0; float Csub = 0;
// Loop over all the sub-matrices of A and B // Loop over all the sub-matrices of A and B
// required to compute the block sub-matrix // required to compute the block sub-matrix
for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) { for (int a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep) {
// Declaration of the shared memory array As used to // Declaration of the shared memory array As used to
// store the sub-matrix of A // store the sub-matrix of A
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE]; __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
// Declaration of the shared memory array Bs used to // Declaration of the shared memory array Bs used to
// store the sub-matrix of B // store the sub-matrix of B
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE]; __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Load the matrices from device memory // Load the matrices from device memory
// to shared memory; each thread loads // to shared memory; each thread loads
// one element of each matrix // one element of each matrix
As[ty][tx] = A[a + wA * ty + tx]; As[ty][tx] = A[a + wA * ty + tx];
Bs[ty][tx] = B[b + wB * ty + tx]; Bs[ty][tx] = B[b + wB * ty + tx];
// Synchronize to make sure the matrices are loaded // Synchronize to make sure the matrices are loaded
cooperative_groups::sync(cta); cooperative_groups::sync(cta);
// Multiply the two matrices together; // Multiply the two matrices together;
// each thread computes one element // each thread computes one element
// of the block sub-matrix // of the block sub-matrix
#pragma unroll #pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k) { for (int k = 0; k < BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx]; Csub += As[ty][k] * Bs[k][tx];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
cooperative_groups::sync(cta);
} }
// Synchronize to make sure that the preceding // Write the block sub-matrix to device memory;
// computation is done before loading two new // each thread writes one element
// sub-matrices of A and B in the next iteration int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
cooperative_groups::sync(cta); C[c + wB * ty + tx] = Csub;
}
// Write the block sub-matrix to device memory;
// each thread writes one element
int c = wB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
C[c + wB * ty + tx] = Csub;
} }
extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, extern "C" __global__ void matrixMulCUDA_block16(float *C, float *A, float *B, int wA, int wB)
int wA, int wB) { {
matrixMulCUDA<16>(C, A, B, wA, wB); matrixMulCUDA<16>(C, A, B, wA, wB);
} }
extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, extern "C" __global__ void matrixMulCUDA_block32(float *C, float *A, float *B, int wA, int wB)
int wA, int wB) { {
matrixMulCUDA<32>(C, A, B, wA, wB); matrixMulCUDA<32>(C, A, B, wA, wB);
} }
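A concrete walk through the tile indexing above (illustrative, using BLOCK_SIZE = 32, wA = 320 and wB = 640): for the thread block at bx = 1, by = 2, aBegin = 320 * 32 * 2 = 20480, aEnd = aBegin + 319 and aStep = 32, so the loop visits 320 / 32 = 10 tiles of A; on the B side, bBegin = 32 * 1 = 32 and bStep = 32 * 640 = 20480, advancing one tile-row of B per iteration.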
View File
@ -28,252 +28,254 @@
#include <cooperative_groups.h> #include <cooperative_groups.h>
namespace cg = cooperative_groups; namespace cg = cooperative_groups;
#include <helper_cuda.h>
#include <assert.h> #include <assert.h>
#include <helper_cuda.h>
#include "mergeSort_common.h" #include "mergeSort_common.h"
inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, inline __device__ void Comparator(uint &keyA, uint &valA, uint &keyB, uint &valB, uint arrowDir)
uint &valB, uint arrowDir) { {
uint t; uint t;
if ((keyA > keyB) == arrowDir) { if ((keyA > keyB) == arrowDir) {
t = keyA; t = keyA;
keyA = keyB; keyA = keyB;
keyB = t; keyB = t;
t = valA; t = valA;
valA = valB; valA = valB;
valB = t; valB = t;
} }
} }
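As a small illustration of the comparator above: with arrowDir = 1, a pair keyA = 7, keyB = 3 satisfies (keyA > keyB) == arrowDir, so keys and values are swapped and the pair ends up ascending (3, 7); with arrowDir = 0 the same pair is left untouched, i.e. descending.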
__global__ void bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, __global__ void
uint *d_SrcKey, uint *d_SrcVal, bitonicSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength, uint sortDir)
uint arrayLength, uint sortDir) { {
// Handle to thread block group // Handle to thread block group
cg::thread_block cta = cg::this_thread_block(); cg::thread_block cta = cg::this_thread_block();
// Shared memory storage for one or more short vectors // Shared memory storage for one or more short vectors
__shared__ uint s_key[SHARED_SIZE_LIMIT]; __shared__ uint s_key[SHARED_SIZE_LIMIT];
__shared__ uint s_val[SHARED_SIZE_LIMIT]; __shared__ uint s_val[SHARED_SIZE_LIMIT];
// Offset to the beginning of subbatch and load data // Offset to the beginning of subbatch and load data
d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
s_key[threadIdx.x + 0] = d_SrcKey[0]; s_key[threadIdx.x + 0] = d_SrcKey[0];
s_val[threadIdx.x + 0] = d_SrcVal[0]; s_val[threadIdx.x + 0] = d_SrcVal[0];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
d_SrcKey[(SHARED_SIZE_LIMIT / 2)]; s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] =
d_SrcVal[(SHARED_SIZE_LIMIT / 2)];
for (uint size = 2; size < arrayLength; size <<= 1) { for (uint size = 2; size < arrayLength; size <<= 1) {
// Bitonic merge // Bitonic merge
uint dir = (threadIdx.x & (size / 2)) != 0; uint dir = (threadIdx.x & (size / 2)) != 0;
for (uint stride = size / 2; stride > 0; stride >>= 1) { for (uint stride = size / 2; stride > 0; stride >>= 1) {
cg::sync(cta); cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], dir);
s_val[pos + stride], dir); }
} }
}
// ddd == sortDir for the last bitonic merge step // ddd == sortDir for the last bitonic merge step
{ {
for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) { for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) {
cg::sync(cta); cg::sync(cta);
uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], Comparator(s_key[pos + 0], s_val[pos + 0], s_key[pos + stride], s_val[pos + stride], sortDir);
s_val[pos + stride], sortDir); }
} }
}
cg::sync(cta); cg::sync(cta);
d_DstKey[0] = s_key[threadIdx.x + 0]; d_DstKey[0] = s_key[threadIdx.x + 0];
d_DstVal[0] = s_val[threadIdx.x + 0]; d_DstVal[0] = s_val[threadIdx.x + 0];
d_DstKey[(SHARED_SIZE_LIMIT / 2)] = d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
d_DstVal[(SHARED_SIZE_LIMIT / 2)] =
s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
} }
// Helper function (also used by odd-even merge sort) // Helper function (also used by odd-even merge sort)
extern "C" uint factorRadix2(uint *log2L, uint L) { extern "C" uint factorRadix2(uint *log2L, uint L)
if (!L) { {
*log2L = 0; if (!L) {
return 0; *log2L = 0;
} else { return 0;
for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++) }
; else {
for (*log2L = 0; (L & 1) == 0; L >>= 1, *log2L++)
;
return L; return L;
} }
} }
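
A brief worked example of the helper's contract, since only the returned remainder is consulted by the caller below: the remainder is 1 exactly when the length is a power of two.

// Hypothetical call site, illustrative values only:
uint log2L;
assert(factorRadix2(&log2L, 1024U) == 1); // 1024 = 2^10: a supported array length
assert(factorRadix2(&log2L, 768U) == 3);  // 768 = 3 * 2^8: rejected by the caller's assert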
extern "C" void bitonicSortShared(uint *d_DstKey,
                                  uint *d_DstVal,
                                  uint *d_SrcKey,
                                  uint *d_SrcVal,
                                  uint  batchSize,
                                  uint  arrayLength,
                                  uint  sortDir)
{
    // Nothing to sort
    if (arrayLength < 2) {
        return;
    }

    // Only power-of-two array lengths are supported by this implementation
    uint log2L;
    uint factorizationRemainder = factorRadix2(&log2L, arrayLength);
    assert(factorizationRemainder == 1);

    uint blockCount  = batchSize * arrayLength / SHARED_SIZE_LIMIT;
    uint threadCount = SHARED_SIZE_LIMIT / 2;

    assert(arrayLength <= SHARED_SIZE_LIMIT);
    assert((batchSize * arrayLength) % SHARED_SIZE_LIMIT == 0);

    bitonicSortSharedKernel<<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength, sortDir);
    getLastCudaError("bitonicSortSharedKernel<<<>>> failed!\n");
}
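
As a usage sketch, a call that satisfies the asserts above could look like this; the buffer names and sizes are assumptions for illustration, with each device buffer holding batchSize * arrayLength elements.

// Hypothetical call site: arrayLength is a power of two no larger than
// SHARED_SIZE_LIMIT (1024) and batchSize * arrayLength is a multiple of it.
const uint batchSize   = 64;
const uint arrayLength = 256; // 64 * 256 = 16384 key/value pairs in total
bitonicSortShared(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, batchSize, arrayLength, 1 /* ascending */);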
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }

static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }

template <uint sortDir>
static inline __device__ void
ComparatorExtended(uint &keyA, uint &valA, uint &flagA, uint &keyB, uint &valB, uint &flagB, uint arrowDir)
{
    uint t;

    if ((!(flagA || flagB) && ((keyA > keyB) == arrowDir)) || ((arrowDir == sortDir) && (flagA == 1))
        || ((arrowDir != sortDir) && (flagB == 1))) {
        t     = keyA;
        keyA  = keyB;
        keyB  = t;
        t     = valA;
        valA  = valB;
        valB  = t;
        t     = flagA;
        flagA = flagB;
        flagB = t;
    }
}

template <uint sortDir>
__global__ void bitonicMergeElementaryIntervalsKernel(uint *d_DstKey,
                                                      uint *d_DstVal,
                                                      uint *d_SrcKey,
                                                      uint *d_SrcVal,
                                                      uint *d_LimitsA,
                                                      uint *d_LimitsB,
                                                      uint  stride,
                                                      uint  N)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint  s_key[2 * SAMPLE_STRIDE];
    __shared__ uint  s_val[2 * SAMPLE_STRIDE];
    __shared__ uint  s_inf[2 * SAMPLE_STRIDE];

    const uint intervalI   = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
    const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
    d_SrcKey += segmentBase;
    d_SrcVal += segmentBase;
    d_DstKey += segmentBase;
    d_DstVal += segmentBase;

    // Set up threadblock-wide parameters
    __shared__ uint startSrcA, lenSrcA, startSrcB, lenSrcB, startDst;

    if (threadIdx.x == 0) {
        uint segmentElementsA = stride;
        uint segmentElementsB = umin(stride, N - segmentBase - stride);
        uint segmentSamplesA  = stride / SAMPLE_STRIDE;
        uint segmentSamplesB  = getSampleCount(segmentElementsB);
        uint segmentSamples   = segmentSamplesA + segmentSamplesB;

        startSrcA    = d_LimitsA[blockIdx.x];
        startSrcB    = d_LimitsB[blockIdx.x];
        startDst     = startSrcA + startSrcB;
        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
        lenSrcA      = endSrcA - startSrcA;
        lenSrcB      = endSrcB - startSrcB;
    }

    s_inf[threadIdx.x + 0]             = 1;
    s_inf[threadIdx.x + SAMPLE_STRIDE] = 1;

    // Load input data
    cg::sync(cta);

    if (threadIdx.x < lenSrcA) {
        s_key[threadIdx.x] = d_SrcKey[0 + startSrcA + threadIdx.x];
        s_val[threadIdx.x] = d_SrcVal[0 + startSrcA + threadIdx.x];
        s_inf[threadIdx.x] = 0;
    }

    // Prepare for bitonic merge by inversing the ordering
    if (threadIdx.x < lenSrcB) {
        s_key[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcKey[stride + startSrcB + threadIdx.x];
        s_val[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = d_SrcVal[stride + startSrcB + threadIdx.x];
        s_inf[2 * SAMPLE_STRIDE - 1 - threadIdx.x] = 0;
    }

    //"Extended" bitonic merge
    for (uint stride = SAMPLE_STRIDE; stride > 0; stride >>= 1) {
        cg::sync(cta);
        uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
        ComparatorExtended<sortDir>(s_key[pos + 0],
                                    s_val[pos + 0],
                                    s_inf[pos + 0],
                                    s_key[pos + stride],
                                    s_val[pos + stride],
                                    s_inf[pos + stride],
                                    sortDir);
    }

    // Store sorted data
    cg::sync(cta);
    d_DstKey += startDst;
    d_DstVal += startDst;

    if (threadIdx.x < lenSrcA) {
        d_DstKey[threadIdx.x] = s_key[threadIdx.x];
        d_DstVal[threadIdx.x] = s_val[threadIdx.x];
    }

    if (threadIdx.x < lenSrcB) {
        d_DstKey[lenSrcA + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
        d_DstVal[lenSrcA + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
    }
}

extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
                                                uint *d_DstVal,
                                                uint *d_SrcKey,
                                                uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB,
                                                uint  stride,
                                                uint  N,
                                                uint  sortDir)
{
    uint lastSegmentElements = N % (2 * stride);

    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;

    if (sortDir) {
        bitonicMergeElementaryIntervalsKernel<1U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
    }
    else {
        bitonicMergeElementaryIntervalsKernel<0U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
    }
}

View File

@ -26,96 +26,94 @@
 */

#include <assert.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
#include <stdio.h>
#include <stdlib.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Test driver
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    uint *h_SrcKey, *h_SrcVal, *h_DstKey, *h_DstVal;
    uint *d_SrcKey, *d_SrcVal, *d_BufKey, *d_BufVal, *d_DstKey, *d_DstVal;
    StopWatchInterface *hTimer = NULL;

    const uint N         = 4 * 1048576;
    const uint DIR       = 1;
    const uint numValues = 65536;

    printf("%s Starting...\n\n", argv[0]);

    int dev = findCudaDevice(argc, (const char **)argv);

    if (dev == -1) {
        return EXIT_FAILURE;
    }

    printf("Allocating and initializing host arrays...\n\n");
    sdkCreateTimer(&hTimer);
    h_SrcKey = (uint *)malloc(N * sizeof(uint));
    h_SrcVal = (uint *)malloc(N * sizeof(uint));
    h_DstKey = (uint *)malloc(N * sizeof(uint));
    h_DstVal = (uint *)malloc(N * sizeof(uint));

    srand(2009);

    for (uint i = 0; i < N; i++) {
        h_SrcKey[i] = rand() % numValues;
    }

    fillValues(h_SrcVal, N);

    printf("Allocating and initializing CUDA arrays...\n\n");
    checkCudaErrors(cudaMalloc((void **)&d_DstKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_DstVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_BufVal, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcKey, N * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_SrcVal, N * sizeof(uint)));
    checkCudaErrors(cudaMemcpy(d_SrcKey, h_SrcKey, N * sizeof(uint), cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_SrcVal, h_SrcVal, N * sizeof(uint), cudaMemcpyHostToDevice));

    printf("Initializing GPU merge sort...\n");
    initMergeSort();

    printf("Running GPU merge sort...\n");
    checkCudaErrors(cudaDeviceSynchronize());
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);
    mergeSort(d_DstKey, d_DstVal, d_BufKey, d_BufVal, d_SrcKey, d_SrcVal, N, DIR);
    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&hTimer);
    printf("Time: %f ms\n", sdkGetTimerValue(&hTimer));

    printf("Reading back GPU merge sort results...\n");
    checkCudaErrors(cudaMemcpy(h_DstKey, d_DstKey, N * sizeof(uint), cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(h_DstVal, d_DstVal, N * sizeof(uint), cudaMemcpyDeviceToHost));

    printf("Inspecting the results...\n");
    uint keysFlag   = validateSortedKeys(h_DstKey, h_SrcKey, 1, N, numValues, DIR);
    uint valuesFlag = validateSortedValues(h_DstKey, h_DstVal, h_SrcKey, 1, N);

    printf("Shutting down...\n");
    closeMergeSort();
    sdkDeleteTimer(&hTimer);
    checkCudaErrors(cudaFree(d_SrcVal));
    checkCudaErrors(cudaFree(d_SrcKey));
    checkCudaErrors(cudaFree(d_BufVal));
    checkCudaErrors(cudaFree(d_BufKey));
    checkCudaErrors(cudaFree(d_DstVal));
    checkCudaErrors(cudaFree(d_DstKey));
    free(h_DstVal);
    free(h_DstKey);
    free(h_SrcVal);
    free(h_SrcKey);

    exit((keysFlag && valuesFlag) ? EXIT_SUCCESS : EXIT_FAILURE);
}
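
If a full CPU reference is wanted in addition to the validators, the mergeSortHost() emulation declared in mergeSort_common.h could be driven the same way; the snippet below is a hypothetical extension of the driver above (it would sit inside main() before the cleanup), not part of the sample.

// Hypothetical CPU cross-check using the declared mergeSortHost() routine:
uint *h_RefKey = (uint *)malloc(N * sizeof(uint));
uint *h_RefVal = (uint *)malloc(N * sizeof(uint));
uint *h_BufKey = (uint *)malloc(N * sizeof(uint));
uint *h_BufVal = (uint *)malloc(N * sizeof(uint));
mergeSortHost(h_RefKey, h_RefVal, h_BufKey, h_BufVal, h_SrcKey, h_SrcVal, N, DIR);
// h_RefKey/h_RefVal can now be compared element by element with h_DstKey/h_DstVal.
free(h_BufVal);
free(h_BufKey);
free(h_RefVal);
free(h_RefKey);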

View File

@ -39,491 +39,499 @@
namespace cg = cooperative_groups;

#include <helper_cuda.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static inline __host__ __device__ uint iDivUp(uint a, uint b) { return ((a % b) == 0) ? (a / b) : (a / b + 1); }

static inline __host__ __device__ uint getSampleCount(uint dividend) { return iDivUp(dividend, SAMPLE_STRIDE); }

#define W (sizeof(uint) * 8)

static inline __device__ uint nextPowerOfTwo(uint x)
{
    /*
      --x;
      x |= x >> 1;
      x |= x >> 2;
      x |= x >> 4;
      x |= x >> 8;
      x |= x >> 16;
      return ++x;
    */
    return 1U << (W - __clz(x - 1));
}

template <uint sortDir> static inline __device__ uint binarySearchInclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }

    uint pos = 0;

    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }

    return pos;
}

template <uint sortDir> static inline __device__ uint binarySearchExclusive(uint val, uint *data, uint L, uint stride)
{
    if (L == 0) {
        return 0;
    }

    uint pos = 0;

    for (; stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }

    return pos;
}
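
A small worked example of the two search flavours, since the merge logic below relies on how they break ties between equal keys.

// With sortDir == 1 (ascending) and data = {1, 3, 3, 7} (L = 4, stride = 4):
//   binarySearchExclusive<1>(3, data, 4, 4) == 1   // keys strictly less than 3
//   binarySearchInclusive<1>(3, data, 4, 4) == 3   // keys less than or equal to 3
// mergeSortSharedKernel below uses Exclusive for keys of the first half and
// Inclusive for keys of the second half, so equal keys from the first half are
// placed first and the merge of the two halves stays stable.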
////////////////////////////////////////////////////////////////////////////////
// Bottom-level merge sort (binary search-based)
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void mergeSortSharedKernel(uint *d_DstKey, uint *d_DstVal, uint *d_SrcKey, uint *d_SrcVal, uint arrayLength)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[SHARED_SIZE_LIMIT];
    __shared__ uint s_val[SHARED_SIZE_LIMIT];

    d_SrcKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_SrcVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstKey += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    d_DstVal += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x;
    s_key[threadIdx.x + 0] = d_SrcKey[0];
    s_val[threadIdx.x + 0] = d_SrcVal[0];
    s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcKey[(SHARED_SIZE_LIMIT / 2)];
    s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = d_SrcVal[(SHARED_SIZE_LIMIT / 2)];

    for (uint stride = 1; stride < arrayLength; stride <<= 1) {
        uint lPos = threadIdx.x & (stride - 1);
        uint *baseKey = s_key + 2 * (threadIdx.x - lPos);
        uint *baseVal = s_val + 2 * (threadIdx.x - lPos);

        cg::sync(cta);
        uint keyA = baseKey[lPos + 0];
        uint valA = baseVal[lPos + 0];
        uint keyB = baseKey[lPos + stride];
        uint valB = baseVal[lPos + stride];
        uint posA = binarySearchExclusive<sortDir>(keyA, baseKey + stride, stride, stride) + lPos;
        uint posB = binarySearchInclusive<sortDir>(keyB, baseKey + 0, stride, stride) + lPos;

        cg::sync(cta);
        baseKey[posA] = keyA;
        baseVal[posA] = valA;
        baseKey[posB] = keyB;
        baseVal[posB] = valB;
    }

    cg::sync(cta);
    d_DstKey[0] = s_key[threadIdx.x + 0];
    d_DstVal[0] = s_val[threadIdx.x + 0];
    d_DstKey[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
    d_DstVal[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
}

static void mergeSortShared(uint *d_DstKey,
                            uint *d_DstVal,
                            uint *d_SrcKey,
                            uint *d_SrcVal,
                            uint batchSize,
                            uint arrayLength,
                            uint sortDir)
{
    if (arrayLength < 2) {
        return;
    }

    assert(SHARED_SIZE_LIMIT % arrayLength == 0);
    assert(((batchSize * arrayLength) % SHARED_SIZE_LIMIT) == 0);
    uint blockCount = batchSize * arrayLength / SHARED_SIZE_LIMIT;
    uint threadCount = SHARED_SIZE_LIMIT / 2;

    if (sortDir) {
        mergeSortSharedKernel<1U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<1><<<>>> failed\n");
    }
    else {
        mergeSortSharedKernel<0U><<<blockCount, threadCount>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, arrayLength);
        getLastCudaError("mergeSortShared<0><<<>>> failed\n");
    }
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 1: generate sample ranks
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
__global__ void
generateSampleRanksKernel(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (pos >= threadCount) {
        return;
    }

    const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    d_SrcKey += segmentBase;
    d_RanksA += segmentBase / SAMPLE_STRIDE;
    d_RanksB += segmentBase / SAMPLE_STRIDE;

    const uint segmentElementsA = stride;
    const uint segmentElementsB = umin(stride, N - segmentBase - stride);
    const uint segmentSamplesA = getSampleCount(segmentElementsA);
    const uint segmentSamplesB = getSampleCount(segmentElementsB);

    if (i < segmentSamplesA) {
        d_RanksA[i] = i * SAMPLE_STRIDE;
        d_RanksB[i] = binarySearchExclusive<sortDir>(
            d_SrcKey[i * SAMPLE_STRIDE], d_SrcKey + stride, segmentElementsB, nextPowerOfTwo(segmentElementsB));
    }

    if (i < segmentSamplesB) {
        d_RanksB[(stride / SAMPLE_STRIDE) + i] = i * SAMPLE_STRIDE;
        d_RanksA[(stride / SAMPLE_STRIDE) + i] = binarySearchInclusive<sortDir>(
            d_SrcKey[stride + i * SAMPLE_STRIDE], d_SrcKey + 0, segmentElementsA, nextPowerOfTwo(segmentElementsA));
    }
}

static void generateSampleRanks(uint *d_RanksA, uint *d_RanksB, uint *d_SrcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    if (sortDir) {
        generateSampleRanksKernel<1U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<1U><<<>>> failed\n");
    }
    else {
        generateSampleRanksKernel<0U>
            <<<iDivUp(threadCount, 256), 256>>>(d_RanksA, d_RanksB, d_SrcKey, stride, N, threadCount);
        getLastCudaError("generateSampleRanksKernel<0U><<<>>> failed\n");
    }
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 2: generate sample ranks and indices
////////////////////////////////////////////////////////////////////////////////
__global__ void mergeRanksAndIndicesKernel(uint *d_Limits, uint *d_Ranks, uint stride, uint N, uint threadCount)
{
    uint pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (pos >= threadCount) {
        return;
    }

    const uint i = pos & ((stride / SAMPLE_STRIDE) - 1);
    const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
    d_Ranks += (pos - i) * 2;
    d_Limits += (pos - i) * 2;

    const uint segmentElementsA = stride;
    const uint segmentElementsB = umin(stride, N - segmentBase - stride);
    const uint segmentSamplesA = getSampleCount(segmentElementsA);
    const uint segmentSamplesB = getSampleCount(segmentElementsB);

    if (i < segmentSamplesA) {
        uint dstPos = binarySearchExclusive<1U>(
                          d_Ranks[i], d_Ranks + segmentSamplesA, segmentSamplesB, nextPowerOfTwo(segmentSamplesB))
                      + i;
        d_Limits[dstPos] = d_Ranks[i];
    }

    if (i < segmentSamplesB) {
        uint dstPos = binarySearchInclusive<1U>(
                          d_Ranks[segmentSamplesA + i], d_Ranks, segmentSamplesA, nextPowerOfTwo(segmentSamplesA))
                      + i;
        d_Limits[dstPos] = d_Ranks[segmentSamplesA + i];
    }
}

static void mergeRanksAndIndices(uint *d_LimitsA, uint *d_LimitsB, uint *d_RanksA, uint *d_RanksB, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint threadCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsA, d_RanksA, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(A)<<<>>> failed\n");

    mergeRanksAndIndicesKernel<<<iDivUp(threadCount, 256), 256>>>(d_LimitsB, d_RanksB, stride, N, threadCount);
    getLastCudaError("mergeRanksAndIndicesKernel(B)<<<>>> failed\n");
}
////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals
////////////////////////////////////////////////////////////////////////////////
template <uint sortDir>
inline __device__ void merge(uint *dstKey,
                             uint *dstVal,
                             uint *srcAKey,
                             uint *srcAVal,
                             uint *srcBKey,
                             uint *srcBVal,
                             uint lenA,
                             uint nPowTwoLenA,
                             uint lenB,
                             uint nPowTwoLenB,
                             cg::thread_block cta)
{
    uint keyA, valA, keyB, valB, dstPosA, dstPosB;

    if (threadIdx.x < lenA) {
        keyA = srcAKey[threadIdx.x];
        valA = srcAVal[threadIdx.x];
        dstPosA = binarySearchExclusive<sortDir>(keyA, srcBKey, lenB, nPowTwoLenB) + threadIdx.x;
    }

    if (threadIdx.x < lenB) {
        keyB = srcBKey[threadIdx.x];
        valB = srcBVal[threadIdx.x];
        dstPosB = binarySearchInclusive<sortDir>(keyB, srcAKey, lenA, nPowTwoLenA) + threadIdx.x;
    }

    cg::sync(cta);

    if (threadIdx.x < lenA) {
        dstKey[dstPosA] = keyA;
        dstVal[dstPosA] = valA;
    }

    if (threadIdx.x < lenB) {
        dstKey[dstPosB] = keyB;
        dstVal[dstPosB] = valB;
    }
}

template <uint sortDir>
__global__ void mergeElementaryIntervalsKernel(uint *d_DstKey,
                                                uint *d_DstVal,
                                                uint *d_SrcKey,
                                                uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB,
                                                uint stride,
                                                uint N)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    __shared__ uint s_key[2 * SAMPLE_STRIDE];
    __shared__ uint s_val[2 * SAMPLE_STRIDE];

    const uint intervalI = blockIdx.x & ((2 * stride) / SAMPLE_STRIDE - 1);
    const uint segmentBase = (blockIdx.x - intervalI) * SAMPLE_STRIDE;
    d_SrcKey += segmentBase;
    d_SrcVal += segmentBase;
    d_DstKey += segmentBase;
    d_DstVal += segmentBase;

    // Set up threadblock-wide parameters
    __shared__ uint startSrcA, startSrcB, lenSrcA, lenSrcB, startDstA, startDstB;

    if (threadIdx.x == 0) {
        uint segmentElementsA = stride;
        uint segmentElementsB = umin(stride, N - segmentBase - stride);
        uint segmentSamplesA = getSampleCount(segmentElementsA);
        uint segmentSamplesB = getSampleCount(segmentElementsB);
        uint segmentSamples = segmentSamplesA + segmentSamplesB;

        startSrcA = d_LimitsA[blockIdx.x];
        startSrcB = d_LimitsB[blockIdx.x];
        uint endSrcA = (intervalI + 1 < segmentSamples) ? d_LimitsA[blockIdx.x + 1] : segmentElementsA;
        uint endSrcB = (intervalI + 1 < segmentSamples) ? d_LimitsB[blockIdx.x + 1] : segmentElementsB;
        lenSrcA = endSrcA - startSrcA;
        lenSrcB = endSrcB - startSrcB;
        startDstA = startSrcA + startSrcB;
        startDstB = startDstA + lenSrcA;
    }

    // Load main input data
    cg::sync(cta);

    if (threadIdx.x < lenSrcA) {
        s_key[threadIdx.x + 0] = d_SrcKey[0 + startSrcA + threadIdx.x];
        s_val[threadIdx.x + 0] = d_SrcVal[0 + startSrcA + threadIdx.x];
    }

    if (threadIdx.x < lenSrcB) {
        s_key[threadIdx.x + SAMPLE_STRIDE] = d_SrcKey[stride + startSrcB + threadIdx.x];
        s_val[threadIdx.x + SAMPLE_STRIDE] = d_SrcVal[stride + startSrcB + threadIdx.x];
    }

    // Merge data in shared memory
    cg::sync(cta);
    merge<sortDir>(s_key,
                   s_val,
                   s_key + 0,
                   s_val + 0,
                   s_key + SAMPLE_STRIDE,
                   s_val + SAMPLE_STRIDE,
                   lenSrcA,
                   SAMPLE_STRIDE,
                   lenSrcB,
                   SAMPLE_STRIDE,
                   cta);

    // Store merged data
    cg::sync(cta);

    if (threadIdx.x < lenSrcA) {
        d_DstKey[startDstA + threadIdx.x] = s_key[threadIdx.x];
        d_DstVal[startDstA + threadIdx.x] = s_val[threadIdx.x];
    }

    if (threadIdx.x < lenSrcB) {
        d_DstKey[startDstB + threadIdx.x] = s_key[lenSrcA + threadIdx.x];
        d_DstVal[startDstB + threadIdx.x] = s_val[lenSrcA + threadIdx.x];
    }
}

static void mergeElementaryIntervals(uint *d_DstKey,
                                     uint *d_DstVal,
                                     uint *d_SrcKey,
                                     uint *d_SrcVal,
                                     uint *d_LimitsA,
                                     uint *d_LimitsB,
                                     uint stride,
                                     uint N,
                                     uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;

    if (sortDir) {
        mergeElementaryIntervalsKernel<1U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<1> failed\n");
    }
    else {
        mergeElementaryIntervalsKernel<0U>
            <<<mergePairs, SAMPLE_STRIDE>>>(d_DstKey, d_DstVal, d_SrcKey, d_SrcVal, d_LimitsA, d_LimitsB, stride, N);
        getLastCudaError("mergeElementaryIntervalsKernel<0> failed\n");
    }
}
extern "C" void bitonicSortShared(uint *d_DstKey,
                                  uint *d_DstVal,
                                  uint *d_SrcKey,
                                  uint *d_SrcVal,
                                  uint batchSize,
                                  uint arrayLength,
                                  uint sortDir);

extern "C" void bitonicMergeElementaryIntervals(uint *d_DstKey,
                                                uint *d_DstVal,
                                                uint *d_SrcKey,
                                                uint *d_SrcVal,
                                                uint *d_LimitsA,
                                                uint *d_LimitsB,
                                                uint stride,
                                                uint N,
                                                uint sortDir);

static uint *d_RanksA, *d_RanksB, *d_LimitsA, *d_LimitsB;
static const uint MAX_SAMPLE_COUNT = 32768;

extern "C" void initMergeSort(void)
{
    checkCudaErrors(cudaMalloc((void **)&d_RanksA, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_RanksB, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsA, MAX_SAMPLE_COUNT * sizeof(uint)));
    checkCudaErrors(cudaMalloc((void **)&d_LimitsB, MAX_SAMPLE_COUNT * sizeof(uint)));
}

extern "C" void closeMergeSort(void)
{
    checkCudaErrors(cudaFree(d_RanksA));
    checkCudaErrors(cudaFree(d_RanksB));
    checkCudaErrors(cudaFree(d_LimitsB));
    checkCudaErrors(cudaFree(d_LimitsA));
}

extern "C" void mergeSort(uint *d_DstKey,
                          uint *d_DstVal,
                          uint *d_BufKey,
                          uint *d_BufVal,
                          uint *d_SrcKey,
                          uint *d_SrcVal,
                          uint N,
                          uint sortDir)
{
    uint stageCount = 0;

    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
        ;

    uint *ikey, *ival, *okey, *oval;

    if (stageCount & 1) {
        ikey = d_BufKey;
        ival = d_BufVal;
        okey = d_DstKey;
        oval = d_DstVal;
    }
    else {
        ikey = d_DstKey;
        ival = d_DstVal;
        okey = d_BufKey;
        oval = d_BufVal;
    }

    assert(N <= (SAMPLE_STRIDE * MAX_SAMPLE_COUNT));
    assert(N % SHARED_SIZE_LIMIT == 0);
    mergeSortShared(ikey, ival, d_SrcKey, d_SrcVal, N / SHARED_SIZE_LIMIT, SHARED_SIZE_LIMIT, sortDir);

    for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
        uint lastSegmentElements = N % (2 * stride);

        // Find sample ranks and prepare for limiters merge
        generateSampleRanks(d_RanksA, d_RanksB, ikey, stride, N, sortDir);

        // Merge ranks and indices
        mergeRanksAndIndices(d_LimitsA, d_LimitsB, d_RanksA, d_RanksB, stride, N);

        // Merge elementary intervals
        mergeElementaryIntervals(okey, oval, ikey, ival, d_LimitsA, d_LimitsB, stride, N, sortDir);

        if (lastSegmentElements <= stride) {
            // Last merge segment consists of a single array which just needs to be
            // passed through
            checkCudaErrors(cudaMemcpy(okey + (N - lastSegmentElements),
                                       ikey + (N - lastSegmentElements),
                                       lastSegmentElements * sizeof(uint),
                                       cudaMemcpyDeviceToDevice));
            checkCudaErrors(cudaMemcpy(oval + (N - lastSegmentElements),
                                       ival + (N - lastSegmentElements),
                                       lastSegmentElements * sizeof(uint),
                                       cudaMemcpyDeviceToDevice));
        }

        uint *t;
        t = ikey;
        ikey = okey;
        okey = t;
        t = ival;
        ival = oval;
        oval = t;
    }
}
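
A short worked example of the stage-parity choice at the top of mergeSort(), using the sizes from the test driver.

// With N = 4 * 1048576 = 2^22 and SHARED_SIZE_LIMIT = 1024 = 2^10, the stride
// loop visits 2^10 .. 2^21, so stageCount == 12 (even). The bottom-level sort
// then writes into d_DstKey, and after twelve ping-pong swaps the final merge
// output ends up back in d_DstKey, which is what the caller reads.
uint stages = 0;
for (uint stride = 1024U; stride < 4U * 1048576U; stride <<= 1) {
    stages++;
}
assert(stages == 12);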

View File

@ -31,19 +31,17 @@
typedef unsigned int uint;

#define SHARED_SIZE_LIMIT 1024U
#define SAMPLE_STRIDE 128

////////////////////////////////////////////////////////////////////////////////
// Extensive sort validation routine
////////////////////////////////////////////////////////////////////////////////
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir);

extern "C" void fillValues(uint *val, uint N);

extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength);

////////////////////////////////////////////////////////////////////////////////
// CUDA merge sort

@ -52,13 +50,11 @@ extern "C" void initMergeSort(void);

extern "C" void closeMergeSort(void);

extern "C" void
mergeSort(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

////////////////////////////////////////////////////////////////////////////////
// CPU "emulation"
////////////////////////////////////////////////////////////////////////////////
extern "C" void
mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir);

View File

@ -29,329 +29,335 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
static void checkOrder(uint *data, uint N, uint sortDir)
{
    if (N <= 1) {
        return;
    }

    for (uint i = 0; i < N - 1; i++)
        if ((sortDir && (data[i] > data[i + 1])) || (!sortDir && (data[i] < data[i + 1]))) {
            fprintf(stderr, "checkOrder() failed!!!\n");
            exit(EXIT_FAILURE);
        }
}

static uint umin(uint a, uint b) { return (a <= b) ? a : b; }

static uint getSampleCount(uint dividend)
{
    return ((dividend % SAMPLE_STRIDE) != 0) ? (dividend / SAMPLE_STRIDE + 1) : (dividend / SAMPLE_STRIDE);
}

static uint nextPowerOfTwo(uint x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}
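
For reference, a few sample values of the bit-smearing helper above (illustrative only):

//   nextPowerOfTwo(1)    == 1
//   nextPowerOfTwo(1000) == 1024
//   nextPowerOfTwo(1024) == 1024   // exact powers of two are left unchanged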
static uint binarySearchInclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }

    uint pos = 0;

    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] <= val)) || (!sortDir && (data[newPos - 1] >= val))) {
            pos = newPos;
        }
    }

    return pos;
}

static uint binarySearchExclusive(uint val, uint *data, uint L, uint sortDir)
{
    if (L == 0) {
        return 0;
    }

    uint pos = 0;

    for (uint stride = nextPowerOfTwo(L); stride > 0; stride >>= 1) {
        uint newPos = umin(pos + stride, L);

        if ((sortDir && (data[newPos - 1] < val)) || (!sortDir && (data[newPos - 1] > val))) {
            pos = newPos;
        }
    }

    return pos;
}

////////////////////////////////////////////////////////////////////////////////
// Merge step 1: find sample ranks in each segment
////////////////////////////////////////////////////////////////////////////////
static void generateSampleRanks(uint *ranksA, uint *ranksB, uint *srcKey, uint stride, uint N, uint sortDir)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    for (uint pos = 0; pos < sampleCount; pos++) {
        const uint i           = pos & ((stride / SAMPLE_STRIDE) - 1);
        const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
        const uint lenA        = stride;
        const uint lenB        = umin(stride, N - segmentBase - stride);
        const uint nA          = stride / SAMPLE_STRIDE;
        const uint nB          = getSampleCount(lenB);

        if (i < nA) {
            ranksA[(segmentBase + 0) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
            ranksB[(segmentBase + 0) / SAMPLE_STRIDE + i] = binarySearchExclusive(
                srcKey[segmentBase + i * SAMPLE_STRIDE], srcKey + segmentBase + stride, lenB, sortDir);
        }

        if (i < nB) {
            ranksB[(segmentBase + stride) / SAMPLE_STRIDE + i] = i * SAMPLE_STRIDE;
            ranksA[(segmentBase + stride) / SAMPLE_STRIDE + i] = binarySearchInclusive(
                srcKey[segmentBase + stride + i * SAMPLE_STRIDE], srcKey + segmentBase, lenA, sortDir);
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
// Merge step 2: merge ranks and indices to derive elementary intervals
////////////////////////////////////////////////////////////////////////////////
static void mergeRanksAndIndices(uint *limits, uint *ranks, uint stride, uint N)
{
    uint lastSegmentElements = N % (2 * stride);
    uint sampleCount = (lastSegmentElements > stride) ? (N + 2 * stride - lastSegmentElements) / (2 * SAMPLE_STRIDE)
                                                      : (N - lastSegmentElements) / (2 * SAMPLE_STRIDE);

    for (uint pos = 0; pos < sampleCount; pos++) {
        const uint i           = pos & ((stride / SAMPLE_STRIDE) - 1);
        const uint segmentBase = (pos - i) * (2 * SAMPLE_STRIDE);
        const uint lenA        = stride;
        const uint lenB        = umin(stride, N - segmentBase - stride);
        const uint nA          = stride / SAMPLE_STRIDE;
        const uint nB          = getSampleCount(lenB);

        if (i < nA) {
            uint dstPosA =
                binarySearchExclusive(
                    ranks[(segmentBase + 0) / SAMPLE_STRIDE + i], ranks + (segmentBase + stride) / SAMPLE_STRIDE, nB, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + 0) / SAMPLE_STRIDE + i];
        }

        if (i < nB) {
            uint dstPosA =
                binarySearchInclusive(
                    ranks[(segmentBase + stride) / SAMPLE_STRIDE + i], ranks + (segmentBase + 0) / SAMPLE_STRIDE, nA, 1)
                + i;
            assert(dstPosA < nA + nB);
            limits[(segmentBase / SAMPLE_STRIDE) + dstPosA] = ranks[(segmentBase + stride) / SAMPLE_STRIDE + i];
        }
    }
}

////////////////////////////////////////////////////////////////////////////////
// Merge step 3: merge elementary intervals (each interval is <= SAMPLE_STRIDE)
////////////////////////////////////////////////////////////////////////////////
static void merge(uint *dstKey, uint *dstVal, uint *srcAKey, uint *srcAVal, static void merge(uint *dstKey,
uint *srcBKey, uint *srcBVal, uint lenA, uint lenB, uint *dstVal,
uint sortDir) { uint *srcAKey,
checkOrder(srcAKey, lenA, sortDir); uint *srcAVal,
checkOrder(srcBKey, lenB, sortDir); uint *srcBKey,
uint *srcBVal,
uint lenA,
uint lenB,
uint sortDir)
{
checkOrder(srcAKey, lenA, sortDir);
checkOrder(srcBKey, lenB, sortDir);
for (uint i = 0; i < lenA; i++) { for (uint i = 0; i < lenA; i++) {
uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i; uint dstPos = binarySearchExclusive(srcAKey[i], srcBKey, lenB, sortDir) + i;
assert(dstPos < lenA + lenB); assert(dstPos < lenA + lenB);
dstKey[dstPos] = srcAKey[i]; dstKey[dstPos] = srcAKey[i];
dstVal[dstPos] = srcAVal[i]; dstVal[dstPos] = srcAVal[i];
} }
for (uint i = 0; i < lenB; i++) { for (uint i = 0; i < lenB; i++) {
uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i; uint dstPos = binarySearchInclusive(srcBKey[i], srcAKey, lenA, sortDir) + i;
assert(dstPos < lenA + lenB); assert(dstPos < lenA + lenB);
dstKey[dstPos] = srcBKey[i]; dstKey[dstPos] = srcBKey[i];
dstVal[dstPos] = srcBVal[i]; dstVal[dstPos] = srcBVal[i];
} }
} }
static void mergeElementaryIntervals(uint *dstKey, uint *dstVal, uint *srcKey, static void mergeElementaryIntervals(uint *dstKey,
uint *srcVal, uint *limitsA, uint *limitsB, uint *dstVal,
uint stride, uint N, uint sortDir) { uint *srcKey,
uint lastSegmentElements = N % (2 * stride); uint *srcVal,
uint mergePairs = (lastSegmentElements > stride) uint *limitsA,
? getSampleCount(N) uint *limitsB,
: (N - lastSegmentElements) / SAMPLE_STRIDE; uint stride,
uint N,
uint sortDir)
{
uint lastSegmentElements = N % (2 * stride);
uint mergePairs = (lastSegmentElements > stride) ? getSampleCount(N) : (N - lastSegmentElements) / SAMPLE_STRIDE;
for (uint pos = 0; pos < mergePairs; pos++) { for (uint pos = 0; pos < mergePairs; pos++) {
uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1); uint i = pos & ((2 * stride) / SAMPLE_STRIDE - 1);
uint segmentBase = (pos - i) * SAMPLE_STRIDE; uint segmentBase = (pos - i) * SAMPLE_STRIDE;
const uint lenA = stride; const uint lenA = stride;
const uint lenB = umin(stride, N - segmentBase - stride); const uint lenB = umin(stride, N - segmentBase - stride);
const uint nA = stride / SAMPLE_STRIDE; const uint nA = stride / SAMPLE_STRIDE;
const uint nB = getSampleCount(lenB); const uint nB = getSampleCount(lenB);
const uint n = nA + nB; const uint n = nA + nB;
const uint startPosA = limitsA[pos]; const uint startPosA = limitsA[pos];
const uint endPosA = (i + 1 < n) ? limitsA[pos + 1] : lenA; const uint endPosA = (i + 1 < n) ? limitsA[pos + 1] : lenA;
const uint startPosB = limitsB[pos]; const uint startPosB = limitsB[pos];
const uint endPosB = (i + 1 < n) ? limitsB[pos + 1] : lenB; const uint endPosB = (i + 1 < n) ? limitsB[pos + 1] : lenB;
const uint startPosDst = startPosA + startPosB; const uint startPosDst = startPosA + startPosB;
assert(startPosA <= endPosA && endPosA <= lenA); assert(startPosA <= endPosA && endPosA <= lenA);
assert(startPosB <= endPosB && endPosB <= lenB); assert(startPosB <= endPosB && endPosB <= lenB);
assert((endPosA - startPosA) <= SAMPLE_STRIDE); assert((endPosA - startPosA) <= SAMPLE_STRIDE);
assert((endPosB - startPosB) <= SAMPLE_STRIDE); assert((endPosB - startPosB) <= SAMPLE_STRIDE);
merge(dstKey + segmentBase + startPosDst, merge(dstKey + segmentBase + startPosDst,
dstVal + segmentBase + startPosDst, dstVal + segmentBase + startPosDst,
(srcKey + segmentBase + 0) + startPosA, (srcKey + segmentBase + 0) + startPosA,
(srcVal + segmentBase + 0) + startPosA, (srcVal + segmentBase + 0) + startPosA,
(srcKey + segmentBase + stride) + startPosB, (srcKey + segmentBase + stride) + startPosB,
(srcVal + segmentBase + stride) + startPosB, endPosA - startPosA, (srcVal + segmentBase + stride) + startPosB,
endPosB - startPosB, sortDir); endPosA - startPosA,
} endPosB - startPosB,
sortDir);
}
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Retarded bubble sort // Retarded bubble sort
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static void bubbleSort(uint *key, uint *val, uint N, uint sortDir) { static void bubbleSort(uint *key, uint *val, uint N, uint sortDir)
if (N <= 1) { {
return; if (N <= 1) {
} return;
}
for (uint bottom = 0; bottom < N - 1; bottom++) {
uint savePos = bottom; for (uint bottom = 0; bottom < N - 1; bottom++) {
uint saveKey = key[bottom]; uint savePos = bottom;
uint saveKey = key[bottom];
for (uint i = bottom + 1; i < N; i++)
if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) { for (uint i = bottom + 1; i < N; i++)
savePos = i; if ((sortDir && (key[i] < saveKey)) || (!sortDir && (key[i] > saveKey))) {
saveKey = key[i]; savePos = i;
} saveKey = key[i];
}
if (savePos != bottom) {
uint t; if (savePos != bottom) {
t = key[savePos]; uint t;
key[savePos] = key[bottom]; t = key[savePos];
key[bottom] = t; key[savePos] = key[bottom];
t = val[savePos]; key[bottom] = t;
val[savePos] = val[bottom]; t = val[savePos];
val[bottom] = t; val[savePos] = val[bottom];
val[bottom] = t;
}
} }
}
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Interface function // Interface function
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
extern "C" void mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, extern "C" void
uint *bufVal, uint *srcKey, uint *srcVal, uint N, mergeSortHost(uint *dstKey, uint *dstVal, uint *bufKey, uint *bufVal, uint *srcKey, uint *srcVal, uint N, uint sortDir)
uint sortDir) { {
uint *ikey, *ival, *okey, *oval; uint *ikey, *ival, *okey, *oval;
uint stageCount = 0; uint stageCount = 0;
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++) for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1, stageCount++)
; ;
if (stageCount & 1) { if (stageCount & 1) {
ikey = bufKey; ikey = bufKey;
ival = bufVal; ival = bufVal;
okey = dstKey; okey = dstKey;
oval = dstVal; oval = dstVal;
} else { }
ikey = dstKey; else {
ival = dstVal; ikey = dstKey;
okey = bufKey; ival = dstVal;
oval = bufVal; okey = bufKey;
} oval = bufVal;
printf("Bottom-level sort...\n");
memcpy(ikey, srcKey, N * sizeof(uint));
memcpy(ival, srcVal, N * sizeof(uint));
for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos),
sortDir);
}
printf("Merge...\n");
uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
uint lastSegmentElements = N % (2 * stride);
// Find sample ranks and prepare for limiters merge
generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);
// Merge ranks and indices
mergeRanksAndIndices(limitsA, ranksA, stride, N);
mergeRanksAndIndices(limitsB, ranksB, stride, N);
// Merge elementary intervals
mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride,
N, sortDir);
if (lastSegmentElements <= stride) {
// Last merge segment consists of a single array which just needs to be
// passed through
memcpy(okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint));
memcpy(oval + (N - lastSegmentElements), ival + (N - lastSegmentElements),
lastSegmentElements * sizeof(uint));
} }
uint *t; printf("Bottom-level sort...\n");
t = ikey; memcpy(ikey, srcKey, N * sizeof(uint));
ikey = okey; memcpy(ival, srcVal, N * sizeof(uint));
okey = t;
t = ival;
ival = oval;
oval = t;
}
free(limitsB); for (uint pos = 0; pos < N; pos += SHARED_SIZE_LIMIT) {
free(limitsA); bubbleSort(ikey + pos, ival + pos, umin(SHARED_SIZE_LIMIT, N - pos), sortDir);
free(ranksB); }
free(ranksA);
printf("Merge...\n");
uint *ranksA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *ranksB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *limitsA = (uint *)malloc(getSampleCount(N) * sizeof(uint));
uint *limitsB = (uint *)malloc(getSampleCount(N) * sizeof(uint));
memset(ranksA, 0xFF, getSampleCount(N) * sizeof(uint));
memset(ranksB, 0xFF, getSampleCount(N) * sizeof(uint));
memset(limitsA, 0xFF, getSampleCount(N) * sizeof(uint));
memset(limitsB, 0xFF, getSampleCount(N) * sizeof(uint));
for (uint stride = SHARED_SIZE_LIMIT; stride < N; stride <<= 1) {
uint lastSegmentElements = N % (2 * stride);
// Find sample ranks and prepare for limiters merge
generateSampleRanks(ranksA, ranksB, ikey, stride, N, sortDir);
// Merge ranks and indices
mergeRanksAndIndices(limitsA, ranksA, stride, N);
mergeRanksAndIndices(limitsB, ranksB, stride, N);
// Merge elementary intervals
mergeElementaryIntervals(okey, oval, ikey, ival, limitsA, limitsB, stride, N, sortDir);
if (lastSegmentElements <= stride) {
// Last merge segment consists of a single array which just needs to be
// passed through
memcpy(
okey + (N - lastSegmentElements), ikey + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
memcpy(
oval + (N - lastSegmentElements), ival + (N - lastSegmentElements), lastSegmentElements * sizeof(uint));
}
uint *t;
t = ikey;
ikey = okey;
okey = t;
t = ival;
ival = oval;
oval = t;
}
free(limitsB);
free(limitsA);
free(ranksB);
free(ranksA);
} }
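The exclusive/inclusive asymmetry in the two binary searches above is what makes the merge stable and collision-free. A minimal host-side sketch of the same rank idea (illustrative only, reusing binarySearchExclusive/binarySearchInclusive and the uint typedef from mergeSort_common.h; dst is assumed to hold lenA + lenB elements):

// A[i] lands at i + (#B strictly before it); B[i] lands at i + (#A before or equal).
// Ties are therefore won by A, every destination slot is hit exactly once, and the
// relative order of equal keys is preserved.
static void rankMergeSketch(uint *a, uint lenA, uint *b, uint lenB, uint *dst, uint sortDir)
{
    for (uint i = 0; i < lenA; i++)
        dst[binarySearchExclusive(a[i], b, lenB, sortDir) + i] = a[i];

    for (uint i = 0; i < lenB; i++)
        dst[binarySearchInclusive(b[i], a, lenA, sortDir) + i] = b[i];
}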


@ -29,104 +29,100 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mergeSort_common.h"

////////////////////////////////////////////////////////////////////////////////
// Validate sorted keys array (check for integrity and proper order)
////////////////////////////////////////////////////////////////////////////////
extern "C" uint
validateSortedKeys(uint *resKey, uint *srcKey, uint batchSize, uint arrayLength, uint numValues, uint sortDir)
{
    uint *srcHist;
    uint *resHist;

    if (arrayLength < 2) {
        printf("validateSortedKeys(): arrays too short, exiting...\n");
        return 1;
    }

    printf("...inspecting keys array: ");
    srcHist = (uint *)malloc(numValues * sizeof(uint));
    resHist = (uint *)malloc(numValues * sizeof(uint));

    int flag = 1;

    for (uint j = 0; j < batchSize; j++, srcKey += arrayLength, resKey += arrayLength) {
        // Build histograms for keys arrays
        memset(srcHist, 0, numValues * sizeof(uint));
        memset(resHist, 0, numValues * sizeof(uint));

        for (uint i = 0; i < arrayLength; i++) {
            if ((srcKey[i] < numValues) && (resKey[i] < numValues)) {
                srcHist[srcKey[i]]++;
                resHist[resKey[i]]++;
            }
            else {
                fprintf(stderr, "***Set %u source/result key arrays are not limited properly***\n", j);
                flag = 0;
                goto brk;
            }
        }

        // Compare the histograms
        for (uint i = 0; i < numValues; i++)
            if (srcHist[i] != resHist[i]) {
                fprintf(stderr, "***Set %u source/result keys histograms do not match***\n", j);
                flag = 0;
                goto brk;
            }

        // Finally check the ordering
        for (uint i = 0; i < arrayLength - 1; i++)
            if ((sortDir && (resKey[i] > resKey[i + 1])) || (!sortDir && (resKey[i] < resKey[i + 1]))) {
                fprintf(stderr, "***Set %u result key array is not ordered properly***\n", j);
                flag = 0;
                goto brk;
            }
    }

brk:
    free(resHist);
    free(srcHist);

    if (flag)
        printf("OK\n");

    return flag;
}

////////////////////////////////////////////////////////////////////////////////
// Value validation / stability check routines
////////////////////////////////////////////////////////////////////////////////
extern "C" void fillValues(uint *val, uint N)
{
    for (uint i = 0; i < N; i++)
        val[i] = i;
}

extern "C" int validateSortedValues(uint *resKey, uint *resVal, uint *srcKey, uint batchSize, uint arrayLength)
{
    int correctFlag = 1, stableFlag = 1;

    printf("...inspecting keys and values array: ");

    for (uint i = 0; i < batchSize; i++, resKey += arrayLength, resVal += arrayLength) {
        for (uint j = 0; j < arrayLength; j++) {
            if (resKey[j] != srcKey[resVal[j]])
                correctFlag = 0;

            if ((j < arrayLength - 1) && (resKey[j] == resKey[j + 1]) && (resVal[j] > resVal[j + 1]))
                stableFlag = 0;
        }
    }

    printf(correctFlag ? "OK\n" : "***corrupted!!!***\n");
    printf(stableFlag ? "...stability property: stable!\n" : "...stability property: NOT stable\n");

    return correctFlag;
}
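The histogram pass above is simply a permutation check: the sorted output must contain exactly the same multiset of keys as the input. A self-contained sketch of that test (hypothetical helper, not part of the sample; assumes every key is below numValues):

#include <vector>

static bool isPermutationSketch(const unsigned int *src, const unsigned int *res, unsigned int n, unsigned int numValues)
{
    std::vector<unsigned int> srcHist(numValues, 0), resHist(numValues, 0);

    for (unsigned int i = 0; i < n; i++) {
        if (src[i] >= numValues || res[i] >= numValues)
            return false; // out-of-range key: arrays are not limited properly
        srcHist[src[i]]++;
        resHist[res[i]]++;
    }

    return srcHist == resHist; // equal histograms <=> res is a permutation of src
}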


@ -29,106 +29,105 @@
#include <stdio.h>

// Includes CUDA
#include <cooperative_groups.h>
#include <cuda/barrier>
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

namespace cg = cooperative_groups;

#if __CUDA_ARCH__ >= 700
template <bool writeSquareRoot>
__device__ void reduceBlockData(cuda::barrier<cuda::thread_scope_block> &barrier,
                                cg::thread_block_tile<32> &tile32,
                                double &threadSum,
                                double *result)
{
    extern __shared__ double tmp[];

#pragma unroll
    for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
        threadSum += tile32.shfl_down(threadSum, offset);
    }
    if (tile32.thread_rank() == 0) {
        tmp[tile32.meta_group_rank()] = threadSum;
    }

    auto token = barrier.arrive();

    barrier.wait(std::move(token));

    // The warp 0 will perform last round of reduction
    if (tile32.meta_group_rank() == 0) {
        double beta = tile32.thread_rank() < tile32.meta_group_size() ? tmp[tile32.thread_rank()] : 0.0;

#pragma unroll
        for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
            beta += tile32.shfl_down(beta, offset);
        }

        if (tile32.thread_rank() == 0) {
            if (writeSquareRoot)
                *result = sqrt(beta);
            else
                *result = beta;
        }
    }
}
#endif

__global__ void normVecByDotProductAWBarrier(float *vecA, float *vecB, double *partialResults, int size)
{
#if __CUDA_ARCH__ >= 700
#pragma diag_suppress static_var_with_dynamic_init
    cg::thread_block cta = cg::this_thread_block();
    cg::grid_group grid = cg::this_grid();
    ;
    cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);

    __shared__ cuda::barrier<cuda::thread_scope_block> barrier;

    if (threadIdx.x == 0) {
        init(&barrier, blockDim.x);
    }

    cg::sync(cta);

    double threadSum = 0.0;

    for (int i = grid.thread_rank(); i < size; i += grid.size()) {
        threadSum += (double)(vecA[i] * vecB[i]);
    }

    // Each thread block performs reduction of partial dotProducts and writes to
    // global mem.
    reduceBlockData<false>(barrier, tile32, threadSum, &partialResults[blockIdx.x]);

    cg::sync(grid);

    // One block performs the final summation of partial dot products
    // of all the thread blocks and writes the sqrt of final dot product.
    if (blockIdx.x == 0) {
        threadSum = 0.0;
        for (int i = cta.thread_rank(); i < gridDim.x; i += cta.size()) {
            threadSum += partialResults[i];
        }
        reduceBlockData<true>(barrier, tile32, threadSum, &partialResults[0]);
    }

    cg::sync(grid);

    const double finalValue = partialResults[0];

    // Perform normalization of vecA & vecB.
    for (int i = grid.thread_rank(); i < size; i += grid.size()) {
        vecA[i] = (float)vecA[i] / finalValue;
        vecB[i] = (float)vecB[i] / finalValue;
    }

#endif
}

@ -137,119 +136,113 @@ int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", argv[0]);

    // This will pick the best possible CUDA capable device
    int dev = findCudaDevice(argc, (const char **)argv);

    int major = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));

    // Arrive-Wait Barrier require a GPU of Volta (SM7X) architecture or higher.
    if (major < 7) {
        printf("simpleAWBarrier requires SM 7.0 or higher. Exiting...\n");
        exit(EXIT_WAIVED);
    }

    int supportsCooperativeLaunch = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&supportsCooperativeLaunch, cudaDevAttrCooperativeLaunch, dev));

    if (!supportsCooperativeLaunch) {
        printf("\nSelected GPU (%d) does not support Cooperative Kernel Launch, "
               "Waiving the run\n",
               dev);
        exit(EXIT_WAIVED);
    }

    int testResult = runNormVecByDotProductAWBarrier(argc, argv, dev);

    printf("%s completed, returned %s\n", argv[0], testResult ? "OK" : "ERROR!");

    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

int runNormVecByDotProductAWBarrier(int argc, char **argv, int deviceId)
{
    float *vecA, *d_vecA;
    float *vecB, *d_vecB;
    double *d_partialResults;
    int size = 10000000;

    checkCudaErrors(cudaMallocHost(&vecA, sizeof(float) * size));
    checkCudaErrors(cudaMallocHost(&vecB, sizeof(float) * size));

    checkCudaErrors(cudaMalloc(&d_vecA, sizeof(float) * size));
    checkCudaErrors(cudaMalloc(&d_vecB, sizeof(float) * size));

    float baseVal = 2.0;

    for (int i = 0; i < size; i++) {
        vecA[i] = vecB[i] = baseVal;
    }

    cudaStream_t stream;
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    checkCudaErrors(cudaMemcpyAsync(d_vecA, vecA, sizeof(float) * size, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_vecB, vecB, sizeof(float) * size, cudaMemcpyHostToDevice, stream));

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.
    int minGridSize = 0, blockSize = 0;
    checkCudaErrors(
        cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)normVecByDotProductAWBarrier, 0, size));

    int smemSize = ((blockSize / 32) + 1) * sizeof(double);

    int numBlocksPerSm = 0;
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &numBlocksPerSm, normVecByDotProductAWBarrier, blockSize, smemSize));

    int multiProcessorCount = 0;
    checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, deviceId));

    minGridSize = multiProcessorCount * numBlocksPerSm;
    checkCudaErrors(cudaMalloc(&d_partialResults, minGridSize * sizeof(double)));

    printf("Launching normVecByDotProductAWBarrier kernel with numBlocks = %d "
           "blockSize = %d\n",
           minGridSize,
           blockSize);

    dim3 dimGrid(minGridSize, 1, 1), dimBlock(blockSize, 1, 1);

    void *kernelArgs[] = {(void *)&d_vecA, (void *)&d_vecB, (void *)&d_partialResults, (void *)&size};

    checkCudaErrors(cudaLaunchCooperativeKernel(
        (void *)normVecByDotProductAWBarrier, dimGrid, dimBlock, kernelArgs, smemSize, stream));

    checkCudaErrors(cudaMemcpyAsync(vecA, d_vecA, sizeof(float) * size, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    float expectedResult = (baseVal / sqrt(size * baseVal * baseVal));

    unsigned int matches = 0;

    for (int i = 0; i < size; i++) {
        if ((vecA[i] - expectedResult) > 0.00001) {
            printf("mismatch at i = %d\n", i);
            break;
        }
        else {
            matches++;
        }
    }

    printf("Result = %s\n", matches == size ? "PASSED" : "FAILED");
    checkCudaErrors(cudaFree(d_vecA));
    checkCudaErrors(cudaFree(d_vecB));
    checkCudaErrors(cudaFree(d_partialResults));

    checkCudaErrors(cudaFreeHost(vecA));
    checkCudaErrors(cudaFreeHost(vecB));
    return matches == size;
}
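For reference, a stripped-down sketch of the arrive/wait pattern the kernel above relies on (hypothetical kernel, assuming a block of at most 128 threads on SM 7.0+): each thread publishes a value to shared memory, arrives, and only reads other threads' values after the wait completes.

#include <cuda/barrier>

__global__ void arriveWaitSketch(const float *in, float *out)
{
    __shared__ float tmp[128];
    __shared__ cuda::barrier<cuda::thread_scope_block> bar;

    if (threadIdx.x == 0) {
        init(&bar, blockDim.x); // one expected arrival per thread in the block
    }
    __syncthreads();

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    tmp[threadIdx.x] = in[idx];

    auto token = bar.arrive();  // signal "my element is published"
    bar.wait(std::move(token)); // returns once every thread has arrived

    // Any neighbour's element can be read safely now.
    out[idx] = tmp[(threadIdx.x + 1) % blockDim.x];
}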


@ -34,17 +34,17 @@
#endif

// Includes, system
#include <cassert>
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

const char *sampleName = "simpleAssert";

@ -58,9 +58,10 @@ bool testResult = true;
//! Tests assert function.
//! Thread whose id > N will print assertion failed error message.
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int N)
{
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    assert(gtid < N);
}

////////////////////////////////////////////////////////////////////////////////
@ -70,59 +71,60 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void runTest(int argc, char **argv)
{
    int Nblocks = 2;
    int Nthreads = 32;
    cudaError_t error;

#ifndef _WIN32
    utsname OS_System_Type;
    uname(&OS_System_Type);

    printf("OS_System_Type.release = %s\n", OS_System_Type.release);

    if (!strcasecmp(OS_System_Type.sysname, "Darwin")) {
        printf("simpleAssert is not current supported on Mac OSX\n\n");
        exit(EXIT_SUCCESS);
    }
    else {
        printf("OS Info: <%s>\n\n", OS_System_Type.version);
    }

#endif

    // This will pick the best possible CUDA capable device
    findCudaDevice(argc, (const char **)argv);

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.
    dim3 dimGrid(Nblocks);
    dim3 dimBlock(Nthreads);

    printf("Launch kernel to generate assertion failures\n");
    testKernel<<<dimGrid, dimBlock>>>(60);

    // Synchronize (flushes assert output).
    printf("\n-- Begin assert output\n\n");
    error = cudaDeviceSynchronize();
    printf("\n-- End assert output\n\n");

    // Check for errors and failed asserts in asynchronous kernel launch.
    if (error == cudaErrorAssert) {
        printf("Device assert failed as expected, "
               "CUDA error message is: %s\n\n",
               cudaGetErrorString(error));
    }

    testResult = error == cudaErrorAssert;
}
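The pattern above generalizes: a device-side assert surfaces as cudaErrorAssert on the next synchronizing call, and the context cannot be used afterwards. A minimal sketch with a hypothetical kernel and launch shape:

#include <cassert>

__global__ void checkPositive(const float *v, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        assert(v[i] > 0.0f); // fires for any non-positive element
}

// Host side (sketch):
//   checkPositive<<<blocks, threads>>>(d_v, n);
//   cudaError_t err = cudaDeviceSynchronize(); // flushes the assert output
//   if (err == cudaErrorAssert) { /* an assert fired; further CUDA calls on this context will fail */ }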


@ -34,15 +34,16 @@
#endif

// Includes, system
#include <cassert>
#include <stdio.h>

// Includes CUDA
#include <cuda_runtime.h>

#include "nvrtc_helper.h"

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

const char *sampleName = "simpleAssert_nvrtc";

@ -58,56 +59,63 @@ void runTest(int argc, char **argv);
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

void runTest(int argc, char **argv)
{
    int Nblocks = 2;
    int Nthreads = 32;

    // Kernel configuration, where a one-dimensional
    // grid and one-dimensional blocks are configured.
    dim3 dimGrid(Nblocks);
    dim3 dimBlock(Nthreads);

    printf("Launch kernel to generate assertion failures\n");

    char *cubin, *kernel_file;
    size_t cubinSize;

    kernel_file = sdkFindFilePath("simpleAssert_kernel.cu", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

    CUmodule module = loadCUBIN(cubin, argc, argv);

    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel"));

    int count = 60;
    void *args[] = {(void *)&count};

    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   dimGrid.x,
                                   dimGrid.y,
                                   dimGrid.z, /* grid dim */
                                   dimBlock.x,
                                   dimBlock.y,
                                   dimBlock.z, /* block dim */
                                   0,
                                   0, /* shared mem, stream */
                                   &args[0], /* arguments */
                                   0));

    // Synchronize (flushes assert output).
    printf("\n-- Begin assert output\n\n");
    CUresult res = cuCtxSynchronize();
    printf("\n-- End assert output\n\n");

    // Check for errors and failed asserts in asynchronous kernel launch.
    if (res == CUDA_ERROR_ASSERT) {
        printf("Device assert failed as expected\n");
    }

    testResult = res == CUDA_ERROR_ASSERT;
}
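compileFileToCUBIN() and loadCUBIN() are helper wrappers from nvrtc_helper.h. For orientation only, a rough sketch of the underlying NVRTC calls when compiling a source string to PTX directly (error checking omitted; the architecture flag is an assumption):

#include <nvrtc.h>
#include <vector>

std::vector<char> compilePtxSketch(const char *source)
{
    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, source, "assert_kernel.cu", 0, nullptr, nullptr);

    const char *opts[] = {"--gpu-architecture=compute_70"};
    nvrtcCompileProgram(prog, 1, opts); // inspect nvrtcResult and the program log in real code

    size_t ptxSize = 0;
    nvrtcGetPTXSize(prog, &ptxSize);
    std::vector<char> ptx(ptxSize);
    nvrtcGetPTX(prog, ptx.data());

    nvrtcDestroyProgram(&prog);
    return ptx; // load with cuModuleLoadData() and fetch the kernel via cuModuleGetFunction()
}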


@ -32,7 +32,8 @@
//! Thread whose id > N will print assertion failed error message.
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void testKernel(int N)
{
    int gtid = blockIdx.x * blockDim.x + threadIdx.x;
    assert(gtid < N);
}


@ -30,10 +30,10 @@
*/

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@ -45,10 +45,10 @@
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

// Includes, kernels
#include "simpleAtomicIntrinsics_kernel.cuh"

@ -68,67 +68,67 @@ extern "C" bool computeGold(int *gpuData, const int len);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    cudaStream_t stream;
    // This will pick the best possible CUDA capable device
    findCudaDevice(argc, (const char **)argv);

    StopWatchInterface *timer;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    unsigned int numThreads = 256;
    unsigned int numBlocks = 64;
    unsigned int numData = 11;
    unsigned int memSize = sizeof(int) * numData;

    // allocate mem for the result on host side
    int *hOData;
    checkCudaErrors(cudaMallocHost(&hOData, memSize));

    // initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    // allocate device memory for result
    int *dOData;
    checkCudaErrors(cudaMalloc((void **)&dOData, memSize));
    // copy host memory to device to initialize to zero
    checkCudaErrors(cudaMemcpyAsync(dOData, hOData, memSize, cudaMemcpyHostToDevice, stream));

    // execute the kernel
    testKernel<<<numBlocks, numThreads, 0, stream>>>(dOData);

    // Copy result from device to host
    checkCudaErrors(cudaMemcpyAsync(hOData, dOData, memSize, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    checkCudaErrors(cudaFreeHost(hOData));
    checkCudaErrors(cudaFree(dOData));
}
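hOData is allocated with cudaMallocHost rather than malloc because cudaMemcpyAsync only behaves asynchronously with page-locked host memory. A minimal sketch of that pinned-memory + stream pattern (N is an assumed element count):

float *hBuf, *dBuf;
cudaStream_t s;
const size_t bytes = N * sizeof(float);

checkCudaErrors(cudaMallocHost(&hBuf, bytes)); // pinned (page-locked) host memory
checkCudaErrors(cudaMalloc(&dBuf, bytes));
checkCudaErrors(cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking));

checkCudaErrors(cudaMemcpyAsync(dBuf, hBuf, bytes, cudaMemcpyHostToDevice, s));
// ... launch kernels on stream s here ...
checkCudaErrors(cudaMemcpyAsync(hBuf, dBuf, bytes, cudaMemcpyDeviceToHost, s));
checkCudaErrors(cudaStreamSynchronize(s)); // host-side results are valid only after this returns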


@ -42,141 +42,142 @@ extern "C" int computeGold(int *gpuData, const int len);
//! @param idata input data as provided to device
//! @param len number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
int computeGold(int *gpuData, const int len)
{
    int val = 0;

    for (int i = 0; i < len; ++i) {
        val += 10;
    }

    if (val != gpuData[0]) {
        printf("atomicAdd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        val -= 10;
    }

    if (val != gpuData[1]) {
        printf("atomicSub failed\n");
        return false;
    }

    bool found = false;

    for (int i = 0; i < len; ++i) {
        // third element should be a member of [0, len)
        if (i == gpuData[2]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for (int i = 0; i < len; ++i) {
        // fourth element should be len-1
        val = max(val, i);
    }

    if (val != gpuData[3]) {
        printf("atomicMax failed\n");
        return false;
    }

    val = 1 << 8;

    for (int i = 0; i < len; ++i) {
        val = min(val, i);
    }

    if (val != gpuData[4]) {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val = 0;

    for (int i = 0; i < len; ++i) {
        val = (val >= limit) ? 0 : val + 1;
    }

    if (val != gpuData[5]) {
        printf("atomicInc failed\n");
        return false;
    }

    limit = 137;
    val = 0;

    for (int i = 0; i < len; ++i) {
        val = ((val == 0) || (val > limit)) ? limit : val - 1;
    }

    if (val != gpuData[6]) {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;

    for (int i = 0; i < len; ++i) {
        // eighth element should be a member of [0, len)
        if (i == gpuData[7]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 9th element should be 1
        val &= (2 * i + 7);
    }

    if (val != gpuData[8]) {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        // 10th element should be 0xff
        val |= (1 << i);
    }

    if (val != gpuData[9]) {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 11th element should be 0xff
        val ^= i;
    }

    if (val != gpuData[10]) {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}
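A quick worked check of the atomicAnd expectation above: every operand 2*i + 7 is odd, so bit 0 always survives, while the higher bits of the initial 0xff are cleared within the first two iterations (0xff & 7 = 7, then 7 & 9 = 1):

int v = 0xff;
for (int i = 0; i < 8; ++i)
    v &= (2 * i + 7); // 7, 9, 11, ... -> v == 1 after i = 1 and stays 1
// v == 1, matching the "9th element should be 1" comment in computeGold()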


@ -35,48 +35,49 @@
//! @param g_idata input data in global memory
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(int *g_odata)
{
    // access thread id
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Test various atomic instructions

    // Arithmetic atomic instructions

    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    // Atomic subtraction (final should be 0)
    atomicSub(&g_odata[1], 10);

    // Atomic exchange
    atomicExch(&g_odata[2], tid);

    // Atomic maximum
    atomicMax(&g_odata[3], tid);

    // Atomic minimum
    atomicMin(&g_odata[4], tid);

    // Atomic increment (modulo 17+1)
    atomicInc((unsigned int *)&g_odata[5], 17);

    // Atomic decrement
    atomicDec((unsigned int *)&g_odata[6], 137);

    // Atomic compare-and-swap
    atomicCAS(&g_odata[7], tid - 1, tid);

    // Bitwise atomic instructions

    // Atomic AND
    atomicAnd(&g_odata[8], 2 * tid + 7);

    // Atomic OR
    atomicOr(&g_odata[9], 1 << tid);

    // Atomic XOR
    atomicXor(&g_odata[10], tid);
}

#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_
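The kernel above only exercises the built-in integer atomics; operations without a hardware primitive are conventionally built from atomicCAS in a retry loop. A common sketch (not part of this sample) for a single-precision maximum:

__device__ float atomicMaxFloatSketch(float *addr, float value)
{
    int *addr_as_int = (int *)addr;
    int old = *addr_as_int, assumed;

    do {
        assumed = old;
        // Install max(value, current) only if the slot was not changed concurrently.
        old = atomicCAS(addr_as_int, assumed, __float_as_int(fmaxf(value, __int_as_float(assumed))));
    } while (assumed != old);

    return __int_as_float(old);
}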


@ -30,10 +30,10 @@
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32 #ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN #define WINDOWS_LEAN_AND_MEAN
@ -46,7 +46,7 @@
#include <nvrtc_helper.h> #include <nvrtc_helper.h>
// Utilities and timing functions // Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h #include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
const char *sampleName = "simpleAtomicIntrinsics_nvrtc"; const char *sampleName = "simpleAtomicIntrinsics_nvrtc";
@@ -64,84 +64,90 @@ extern "C" bool computeGold(int *gpuData, const int len);
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    int    dev = 0;
    char  *cubin, *kernel_file;
    size_t cubinSize;

    kernel_file = sdkFindFilePath("simpleAtomicIntrinsics_kernel.cuh", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);

    CUmodule module = loadCUBIN(cubin, argc, argv);

    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "testKernel"));

    StopWatchInterface *timer;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    unsigned int numThreads = 256;
    unsigned int numBlocks  = 64;
    unsigned int numData    = 11;
    unsigned int memSize    = sizeof(int) * numData;

    // allocate memory for the result on the host side
    int *hOData = (int *)malloc(memSize);

    // initialize the memory
    for (unsigned int i = 0; i < numData; i++)
        hOData[i] = 0;

    // To make the AND and XOR tests generate something other than 0...
    hOData[8] = hOData[10] = 0xff;

    // allocate device memory for the result and copy the host data over
    CUdeviceptr dOData;
    checkCudaErrors(cuMemAlloc(&dOData, memSize));
    checkCudaErrors(cuMemcpyHtoD(dOData, hOData, memSize));

    // execute the kernel
    dim3 cudaBlockSize(numThreads, 1, 1);
    dim3 cudaGridSize(numBlocks, 1, 1);

    void *arr[] = {(void *)&dOData};
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   cudaGridSize.x,
                                   cudaGridSize.y,
                                   cudaGridSize.z, /* grid dim */
                                   cudaBlockSize.x,
                                   cudaBlockSize.y,
                                   cudaBlockSize.z, /* block dim */
                                   0,
                                   0,       /* shared mem, stream */
                                   &arr[0], /* arguments */
                                   0));

    checkCudaErrors(cuCtxSynchronize());

    // Copy result from device to host
    checkCudaErrors(cuMemcpyDtoH(hOData, dOData, memSize));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Compute reference solution
    testResult = computeGold(hOData, numThreads * numBlocks);

    // Cleanup memory
    free(hOData);
    checkCudaErrors(cuMemFree(dOData));
}
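The driver-API launch above passes kernel arguments through the void *arr[] array handed to cuLaunchKernel. For readers more used to the runtime API, a minimal sketch (not part of this commit) of the equivalent launch, assuming testKernel is compiled into the executable and d_odata is a device allocation of numData ints:

    int *d_odata = NULL;
    cudaMalloc(&d_odata, memSize);
    cudaMemcpy(d_odata, hOData, memSize, cudaMemcpyHostToDevice);
    testKernel<<<numBlocks, numThreads>>>(d_odata); // same grid/block split as cuLaunchKernel above
    cudaMemcpy(hOData, d_odata, memSize, cudaMemcpyDeviceToHost);
    cudaFree(d_odata);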
@@ -43,139 +43,140 @@ extern "C" int computeGold(int *gpuData, const int len);
//! @param len  number of elements in reference / idata
////////////////////////////////////////////////////////////////////////////////
int computeGold(int *gpuData, const int len)
{
    int val = 0;

    for (int i = 0; i < len; ++i) {
        val += 10;
    }

    if (val != gpuData[0]) {
        printf("atomicAdd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        val -= 10;
    }

    if (val != gpuData[1]) {
        printf("atomicSub failed\n");
        return false;
    }

    bool found = false;

    for (int i = 0; i < len; ++i) {
        // third element should be a member of [0, len)
        if (i == gpuData[2]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicExch failed\n");
        return false;
    }

    val = -(1 << 8);

    for (int i = 0; i < len; ++i) {
        // fourth element should be len-1
        val = max(val, i);
    }

    if (val != gpuData[3]) {
        printf("atomicMax failed\n");
        return false;
    }

    val = 1 << 8;

    for (int i = 0; i < len; ++i) {
        val = min(val, i);
    }

    if (val != gpuData[4]) {
        printf("atomicMin failed\n");
        return false;
    }

    int limit = 17;
    val       = 0;

    for (int i = 0; i < len; ++i) {
        val = (val >= limit) ? 0 : val + 1;
    }

    if (val != gpuData[5]) {
        printf("atomicInc failed\n");
        return false;
    }

    limit = 137;
    val   = 0;

    for (int i = 0; i < len; ++i) {
        val = ((val == 0) || (val > limit)) ? limit : val - 1;
    }

    if (val != gpuData[6]) {
        printf("atomicDec failed\n");
        return false;
    }

    found = false;

    for (int i = 0; i < len; ++i) {
        // eighth element should be a member of [0, len)
        if (i == gpuData[7]) {
            found = true;
            break;
        }
    }

    if (!found) {
        printf("atomicCAS failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 9th element should be 1
        val &= (2 * i + 7);
    }

    if (val != gpuData[8]) {
        printf("atomicAnd failed\n");
        return false;
    }

    val = 0;

    for (int i = 0; i < len; ++i) {
        // 10th element should be 0xff
        val |= (1 << i);
    }

    if (val != gpuData[9]) {
        printf("atomicOr failed\n");
        return false;
    }

    val = 0xff;

    for (int i = 0; i < len; ++i) {
        // 11th element should be 0xff
        val ^= i;
    }

    if (val != gpuData[10]) {
        printf("atomicXor failed\n");
        return false;
    }

    return true;
}
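computeGold mirrors each device atomic with a serial host loop. As a small arithmetic sketch (illustration only, assuming the sample's default launch of 64 blocks x 256 threads, i.e. len = 16384), the first few expected values can also be written in closed form:

    const int len         = 64 * 256;       // participating threads
    const int expectedAdd = 10 * len;        // gpuData[0]: 0 + 10 per thread -> 163840
    const int expectedSub = -10 * len;       // gpuData[1]: 0 - 10 per thread -> -163840
    const int expectedInc = len % (17 + 1);  // gpuData[5]: atomicInc wraps at 17 -> 4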
@@ -36,45 +36,46 @@
//! @param g_odata  output data in global memory
////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void testKernel(int *g_odata)
{
    // access thread id
    const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;

    // Test various atomic instructions

    // Arithmetic atomic instructions

    // Atomic addition
    atomicAdd(&g_odata[0], 10);

    // Atomic subtraction (final should be 0)
    atomicSub(&g_odata[1], 10);

    // Atomic exchange
    atomicExch(&g_odata[2], tid);

    // Atomic maximum
    atomicMax(&g_odata[3], tid);

    // Atomic minimum
    atomicMin(&g_odata[4], tid);

    // Atomic increment (modulo 17+1)
    atomicInc((unsigned int *)&g_odata[5], 17);

    // Atomic decrement
    atomicDec((unsigned int *)&g_odata[6], 137);

    // Atomic compare-and-swap
    atomicCAS(&g_odata[7], tid - 1, tid);

    // Bitwise atomic instructions

    // Atomic AND
    atomicAnd(&g_odata[8], 2 * tid + 7);

    // Atomic OR
    atomicOr(&g_odata[9], 1 << tid);

    // Atomic XOR
    atomicXor(&g_odata[10], tid);
}

#endif // #ifndef _SIMPLEATOMICS_KERNEL_H_
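The kernel exercises atomicCAS only once per thread. In practice atomicCAS is most often used inside a compare-and-swap loop to build atomics the hardware does not provide natively; a sketch of that canonical pattern (not part of this sample), emulating a double-precision atomicAdd:

    __device__ double atomicAddDouble(double *address, double val)
    {
        unsigned long long int *address_as_ull = (unsigned long long int *)address;
        unsigned long long int  old = *address_as_ull, assumed;
        do {
            assumed = old;
            // swap in the new value only if nobody changed the location in the meantime
            old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old);
        return __longlong_as_double(old);
    }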
@@ -26,30 +26,31 @@
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes CUDA
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper functions for SDK examples

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char **argv);

cudaAccessPolicyWindow initAccessPolicyWindow(void)
{
    cudaAccessPolicyWindow accessPolicyWindow = {0};
    accessPolicyWindow.base_ptr               = (void *)0;
    accessPolicyWindow.num_bytes              = 0;
    accessPolicyWindow.hitRatio               = 0.f;
    accessPolicyWindow.hitProp                = cudaAccessPropertyNormal;
    accessPolicyWindow.missProp               = cudaAccessPropertyStreaming;
    return accessPolicyWindow;
}

////////////////////////////////////////////////////////////////////////////////
@@ -60,35 +61,35 @@ cudaAccessPolicyWindow initAccessPolicyWindow(void)
//! @param bigDataSize  input bigData size
//! @param hitCount     how many data accesses are done within the block
////////////////////////////////////////////////////////////////////////////////
static __global__ void kernCacheSegmentTest(int *data, int dataSize, int *trash, int bigDataSize, int hitCount)
{
    __shared__ unsigned int hit;
    int                     row    = blockIdx.y * blockDim.y + threadIdx.y;
    int                     col    = blockIdx.x * blockDim.x + threadIdx.x;
    int                     tID    = row * blockDim.y + col;
    uint32_t                psRand = tID;

    atomicExch(&hit, 0);
    __syncthreads();

    while (hit < hitCount) {
        psRand ^= psRand << 13;
        psRand ^= psRand >> 17;
        psRand ^= psRand << 5;

        int idx = tID - psRand;
        if (idx < 0) {
            idx = -idx;
        }

        if ((tID % 2) == 0) {
            data[psRand % dataSize] = data[psRand % dataSize] + data[idx % dataSize];
        }
        else {
            trash[psRand % bigDataSize] = trash[psRand % bigDataSize] + trash[idx % bigDataSize];
        }

        atomicAdd(&hit, 1);
    }
}
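The index scrambling inside the loop is the classic 32-bit xorshift generator. A host-side sketch (illustration only) of the same update, handy for reasoning about which elements a given thread will touch:

    static uint32_t xorshift32(uint32_t x)
    {
        x ^= x << 13;
        x ^= x >> 17;
        x ^= x << 5;
        return x;
    }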
////////////////////////////////////////////////////////////////////////////////
// Program main
@@ -98,117 +99,110 @@ int main(int argc, char **argv) { runTest(argc, argv); }
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    bool                   bTestResult = true;
    cudaAccessPolicyWindow accessPolicyWindow;
    cudaDeviceProp         deviceProp;
    cudaStreamAttrValue    streamAttrValue;
    cudaStream_t           stream;
    cudaStreamAttrID       streamAttrID;
    dim3                   threads(32, 32);
    int                   *dataDevicePointer;
    int                   *dataHostPointer;
    int                    dataSize;
    int                   *bigDataDevicePointer;
    int                   *bigDataHostPointer;
    int                    bigDataSize;
    StopWatchInterface    *timer = 0;

    printf("%s Starting...\n\n", argv[0]);

    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // Get device properties
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
    dim3 blocks(deviceProp.maxGridSize[1], 1);

    // Make sure the device supports the persisting L2 cache optimization
    if (deviceProp.persistingL2CacheMaxSize == 0) {
        printf("Waiving execution as device %d does not support persisting L2 "
               "Caching\n",
               devID);
        exit(EXIT_WAIVED);
    }

    // Create stream to associate with window
    checkCudaErrors(cudaStreamCreate(&stream));

    // Set the amount of L2 cache that will be persisting to the maximum the device
    // can support
    checkCudaErrors(cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize));

    // Stream attribute to set
    streamAttrID = cudaStreamAttributeAccessPolicyWindow;

    // Default window
    streamAttrValue.accessPolicyWindow = initAccessPolicyWindow();
    accessPolicyWindow                 = initAccessPolicyWindow();

    // Allocate size of both buffers
    bigDataSize = (deviceProp.l2CacheSize * 4) / sizeof(int);
    dataSize    = (deviceProp.l2CacheSize / 4) / sizeof(int);

    // Allocate data
    checkCudaErrors(cudaMallocHost(&dataHostPointer, dataSize * sizeof(int)));
    checkCudaErrors(cudaMallocHost(&bigDataHostPointer, bigDataSize * sizeof(int)));

    for (int i = 0; i < bigDataSize; ++i) {
        if (i < dataSize) {
            dataHostPointer[i] = i;
        }

        bigDataHostPointer[bigDataSize - i - 1] = i;
    }

    checkCudaErrors(cudaMalloc((void **)&dataDevicePointer, dataSize * sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&bigDataDevicePointer, bigDataSize * sizeof(int)));
    checkCudaErrors(
        cudaMemcpyAsync(dataDevicePointer, dataHostPointer, dataSize * sizeof(int), cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(
        bigDataDevicePointer, bigDataHostPointer, bigDataSize * sizeof(int), cudaMemcpyHostToDevice, stream));

    // Make a window for the buffer of interest
    accessPolicyWindow.base_ptr        = (void *)dataDevicePointer;
    accessPolicyWindow.num_bytes       = dataSize * sizeof(int);
    accessPolicyWindow.hitRatio        = 1.f;
    accessPolicyWindow.hitProp         = cudaAccessPropertyPersisting;
    accessPolicyWindow.missProp        = cudaAccessPropertyNormal;
    streamAttrValue.accessPolicyWindow = accessPolicyWindow;

    // Assign window to stream
    checkCudaErrors(cudaStreamSetAttribute(stream, streamAttrID, &streamAttrValue));

    // Demote any previously persisting lines
    checkCudaErrors(cudaCtxResetPersistingL2Cache());

    checkCudaErrors(cudaStreamSynchronize(stream));
    kernCacheSegmentTest<<<blocks, threads, 0, stream>>>(
        dataDevicePointer, dataSize, bigDataDevicePointer, bigDataSize, 0xAFFFF);

    checkCudaErrors(cudaStreamSynchronize(stream));
    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    // Free memory
    checkCudaErrors(cudaFreeHost(dataHostPointer));
    checkCudaErrors(cudaFreeHost(bigDataHostPointer));
    checkCudaErrors(cudaFree(dataDevicePointer));
    checkCudaErrors(cudaFree(bigDataDevicePointer));

    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
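Condensed from the flow above, the minimal recipe for making accesses to one buffer persist in L2 is a single stream attribute. This is a sketch (not part of the commit); d_buf, numBytes and s are placeholders for a device allocation, its byte size, and an existing stream:

    cudaStreamAttrValue attr          = {};
    attr.accessPolicyWindow.base_ptr  = (void *)d_buf;
    attr.accessPolicyWindow.num_bytes = numBytes; // limited by the device's accessPolicyMaxWindowSize
    attr.accessPolicyWindow.hitRatio  = 1.0f;     // fraction of accesses that get the hitProp
    attr.accessPolicyWindow.hitProp   = cudaAccessPropertyPersisting;
    attr.accessPolicyWindow.missProp  = cudaAccessPropertyStreaming;
    checkCudaErrors(cudaStreamSetAttribute(s, cudaStreamAttributeAccessPolicyWindow, &attr));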
@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in the [Dependencies]() section above are installed.

## References (for more details)
@@ -35,28 +35,30 @@ __device__ float clamp(float x, float a, float b) { return max(a, min(b, x)); }
__device__ int clamp(int x, int a, int b) { return max(a, min(b, x)); }

// convert floating point rgb color to 8-bit integer
__device__ int rgbToInt(float r, float g, float b)
{
    r = clamp(r, 0.0f, 255.0f);
    g = clamp(g, 0.0f, 255.0f);
    b = clamp(b, 0.0f, 255.0f);
    return (int(b) << 16) | (int(g) << 8) | int(r);
}

__global__ void cudaProcess(unsigned int *g_odata, int imgw)
{
    extern __shared__ uchar4 sdata[];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bw = blockDim.x;
    int bh = blockDim.y;
    int x  = blockIdx.x * bw + tx;
    int y  = blockIdx.y * bh + ty;

    uchar4 c4             = make_uchar4((x & 0x20) ? 100 : 0, 0, (y & 0x20) ? 100 : 0, 0);
    g_odata[y * imgw + x] = rgbToInt(c4.z, c4.y, c4.x);
}

extern "C" void launch_cudaProcess(dim3 grid, dim3 block, int sbytes, unsigned int *g_odata, int imgw)
{
    cudaProcess<<<grid, block, sbytes>>>(g_odata, imgw);
}
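rgbToInt packs the three clamped channels into a 0x00BBGGRR integer. A worked example (illustration only): rgbToInt(255.f, 128.f, 0.f) clamps to (255, 128, 0) and returns

    (0 << 16) | (128 << 8) | 255 // == 0x000080FF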
@@ -29,115 +29,124 @@
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)

// Create thread
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
{
    return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
}

// Wait for thread to finish
void cutEndThread(CUTThread thread)
{
    WaitForSingleObject(thread, INFINITE);
    CloseHandle(thread);
}

// Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num)
{
    WaitForMultipleObjects(num, threads, true, INFINITE);

    for (int i = 0; i < num; i++) {
        CloseHandle(threads[i]);
    }
}

// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount)
{
    CUTBarrier barrier;

    InitializeCriticalSection(&barrier.criticalSection);
    barrier.barrierEvent = CreateEvent(NULL, TRUE, FALSE, TEXT("BarrierEvent"));
    barrier.count        = 0;
    barrier.releaseCount = releaseCount;

    return barrier;
}

// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier)
{
    int myBarrierCount;
    EnterCriticalSection(&barrier->criticalSection);
    myBarrierCount = ++barrier->count;
    LeaveCriticalSection(&barrier->criticalSection);

    if (myBarrierCount >= barrier->releaseCount) {
        SetEvent(barrier->barrierEvent);
    }
}

// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier) { WaitForSingleObject(barrier->barrierEvent, INFINITE); }

// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier) {}

#else

// Create thread
CUTThread cutStartThread(CUT_THREADROUTINE func, void *data)
{
    pthread_t thread;
    pthread_create(&thread, NULL, func, data);
    return thread;
}

// Wait for thread to finish
void cutEndThread(CUTThread thread) { pthread_join(thread, NULL); }

// Wait for multiple threads
void cutWaitForThreads(const CUTThread *threads, int num)
{
    for (int i = 0; i < num; i++) {
        cutEndThread(threads[i]);
    }
}

// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount)
{
    CUTBarrier barrier;

    barrier.count        = 0;
    barrier.releaseCount = releaseCount;

    pthread_mutex_init(&barrier.mutex, 0);
    pthread_cond_init(&barrier.conditionVariable, 0);

    return barrier;
}

// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier)
{
    int myBarrierCount;
    pthread_mutex_lock(&barrier->mutex);
    myBarrierCount = ++barrier->count;
    pthread_mutex_unlock(&barrier->mutex);

    if (myBarrierCount >= barrier->releaseCount) {
        pthread_cond_signal(&barrier->conditionVariable);
    }
}

// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier)
{
    pthread_mutex_lock(&barrier->mutex);

    while (barrier->count < barrier->releaseCount) {
        pthread_cond_wait(&barrier->conditionVariable, &barrier->mutex);
    }

    pthread_mutex_unlock(&barrier->mutex);
}

// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier)
{
    pthread_mutex_destroy(&barrier->mutex);
    pthread_cond_destroy(&barrier->conditionVariable);
}

#endif
@@ -37,15 +37,16 @@
typedef HANDLE CUTThread;
typedef unsigned(WINAPI *CUT_THREADROUTINE)(void *);

struct CUTBarrier
{
    CRITICAL_SECTION criticalSection;
    HANDLE           barrierEvent;
    int              releaseCount;
    int              count;
};

#define CUT_THREADPROC unsigned WINAPI
#define CUT_THREADEND  return 0

#else
// POSIX threads.
@@ -55,44 +56,46 @@ typedef pthread_t CUTThread;
typedef void *(*CUT_THREADROUTINE)(void *);

#define CUT_THREADPROC void *
#define CUT_THREADEND  return 0

struct CUTBarrier
{
    pthread_mutex_t mutex;
    pthread_cond_t  conditionVariable;
    int             releaseCount;
    int             count;
};

#endif

#ifdef __cplusplus
extern "C"
{
#endif

// Create thread.
CUTThread cutStartThread(CUT_THREADROUTINE, void *data);

// Wait for thread to finish.
void cutEndThread(CUTThread thread);

// Wait for multiple threads.
void cutWaitForThreads(const CUTThread *threads, int num);

// Create barrier.
CUTBarrier cutCreateBarrier(int releaseCount);

// Increment barrier. (execution continues)
void cutIncrementBarrier(CUTBarrier *barrier);

// Wait for barrier release.
void cutWaitForBarrier(CUTBarrier *barrier);

// Destroy barrier
void cutDestroyBarrier(CUTBarrier *barrier);

#ifdef __cplusplus
} // extern "C"
#endif

#endif // MULTITHREADING_H
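A typical use of this barrier API is one increment per worker thread and a single wait in the spawning thread. This is a sketch, not part of the header; worker and runWorkers are hypothetical names:

    CUT_THREADPROC worker(void *arg)
    {
        CUTBarrier *barrier = (CUTBarrier *)arg;
        // ... per-thread work ...
        cutIncrementBarrier(barrier); // signal completion
        CUT_THREADEND;
    }

    void runWorkers(int n)
    {
        CUTBarrier barrier = cutCreateBarrier(n);
        for (int i = 0; i < n; ++i)
            cutStartThread(worker, &barrier);
        cutWaitForBarrier(&barrier); // returns once n increments have arrived
        cutDestroyBarrier(&barrier);
    }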
@@ -43,172 +43,173 @@
#include <stdio.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#include "multithreading.h"

const int N_workloads             = 8;
const int N_elements_per_workload = 100000;

CUTBarrier thread_barrier;

void CUDART_CB myStreamCallback(cudaStream_t event, cudaError_t status, void *data);

struct heterogeneous_workload
{
    int id;
    int cudaDeviceID;

    int         *h_data;
    int         *d_data;
    cudaStream_t stream;

    bool success;
};

__global__ void incKernel(int *data, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < N)
        data[i]++;
}

CUT_THREADPROC launch(void *void_arg)
{
    heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;

    // Select GPU for this CPU thread
    checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));

    // Allocate Resources
    checkCudaErrors(cudaStreamCreate(&workload->stream));
    checkCudaErrors(cudaMalloc(&workload->d_data, N_elements_per_workload * sizeof(int)));
    checkCudaErrors(cudaHostAlloc(&workload->h_data, N_elements_per_workload * sizeof(int), cudaHostAllocPortable));

    // CPU thread generates data
    for (int i = 0; i < N_elements_per_workload; ++i) {
        workload->h_data[i] = workload->id + i;
    }

    // Schedule work for GPU in CUDA stream without blocking the CPU thread
    // Note: Dedicated streams enable concurrent execution of workloads on the GPU
    dim3 block(512);
    dim3 grid((N_elements_per_workload + block.x - 1) / block.x);

    checkCudaErrors(cudaMemcpyAsync(workload->d_data,
                                    workload->h_data,
                                    N_elements_per_workload * sizeof(int),
                                    cudaMemcpyHostToDevice,
                                    workload->stream));
    incKernel<<<grid, block, 0, workload->stream>>>(workload->d_data, N_elements_per_workload);
    checkCudaErrors(cudaMemcpyAsync(workload->h_data,
                                    workload->d_data,
                                    N_elements_per_workload * sizeof(int),
                                    cudaMemcpyDeviceToHost,
                                    workload->stream));

    // New in CUDA 5.0: Add a CPU callback which is called once all currently
    // pending operations in the CUDA stream have finished
    checkCudaErrors(cudaStreamAddCallback(workload->stream, myStreamCallback, workload, 0));

    CUT_THREADEND;
    // CPU thread end of life, GPU continues to process data...
}

CUT_THREADPROC postprocess(void *void_arg)
{
    heterogeneous_workload *workload = (heterogeneous_workload *)void_arg;

    // ... GPU is done with processing, continue on new CPU thread...

    // Select GPU for this CPU thread
    checkCudaErrors(cudaSetDevice(workload->cudaDeviceID));

    // CPU thread consumes results from GPU
    workload->success = true;

    for (int i = 0; i < N_workloads; ++i) {
        workload->success &= workload->h_data[i] == i + workload->id + 1;
    }

    // Free Resources
    checkCudaErrors(cudaFree(workload->d_data));
    checkCudaErrors(cudaFreeHost(workload->h_data));
    checkCudaErrors(cudaStreamDestroy(workload->stream));

    // Signal the end of the heterogeneous workload to main thread
    cutIncrementBarrier(&thread_barrier);

    CUT_THREADEND;
}

void CUDART_CB myStreamCallback(cudaStream_t stream, cudaError_t status, void *data)
{
    // Check status of GPU after stream operations are done
    checkCudaErrors(status);

    // Spawn new CPU worker thread and continue processing on the CPU
    cutStartThread(postprocess, data);
}

int main(int argc, char **argv)
{
    int N_gpus, max_gpus = 0;
    int gpuInfo[32]; // assume a maximum of 32 GPUs in a system configuration

    printf("Starting simpleCallback\n");

    checkCudaErrors(cudaGetDeviceCount(&N_gpus));
    printf("Found %d CUDA capable GPUs\n", N_gpus);

    if (N_gpus > 32) {
        printf("simpleCallback only supports 32 GPU(s)\n");
    }

    for (int devid = 0; devid < N_gpus; devid++) {
        int            SMversion;
        cudaDeviceProp deviceProp;
        cudaSetDevice(devid);
        cudaGetDeviceProperties(&deviceProp, devid);
        SMversion = deviceProp.major << 4 + deviceProp.minor;
        printf("GPU[%d] %s supports SM %d.%d", devid, deviceProp.name, deviceProp.major, deviceProp.minor);
        printf(", %s GPU Callback Functions\n", (SMversion >= 0x11) ? "capable" : "NOT capable");

        if (SMversion >= 0x11) {
            gpuInfo[max_gpus++] = devid;
        }
    }

    printf("%d GPUs available to run Callback Functions\n", max_gpus);

    heterogeneous_workload *workloads;
    workloads      = (heterogeneous_workload *)malloc(N_workloads * sizeof(heterogeneous_workload));
    thread_barrier = cutCreateBarrier(N_workloads);

    // Main thread spawns a CPU worker thread for each heterogeneous workload
    printf("Starting %d heterogeneous computing workloads\n", N_workloads);

    for (int i = 0; i < N_workloads; ++i) {
        workloads[i].id           = i;
        workloads[i].cudaDeviceID = gpuInfo[i % max_gpus]; // i % N_gpus;

        cutStartThread(launch, &workloads[i]);
    }

    // Sleep until all workloads have finished
    cutWaitForBarrier(&thread_barrier);
    printf("Total of %d workloads finished:\n", N_workloads);

    bool success = true;

    for (int i = 0; i < N_workloads; ++i) {
        success &= workloads[i].success;
    }

    printf("%s\n", success ? "Success" : "Failure");

    free(workloads);

    exit(success ? EXIT_SUCCESS : EXIT_FAILURE);
}
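Newer CUDA releases also provide cudaLaunchHostFunc for this "run host code once the stream drains" hand-off. A sketch (not part of the sample): the host function receives only a void * and must not call CUDA APIs itself, so it hands the work to a CPU thread just as myStreamCallback does:

    void CUDART_CB myHostFn(void *data)
    {
        // continue on a CPU worker thread, as above
        cutStartThread(postprocess, data);
    }

    // inside launch(), in place of cudaStreamAddCallback:
    checkCudaErrors(cudaLaunchHostFunc(workload->stream, myHostFn, workload));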
@@ -38,8 +38,8 @@
 *
 */

#include <cooperative_groups.h>
#include <stdio.h>

using namespace cooperative_groups;

@@ -49,35 +49,36 @@ using namespace cooperative_groups;
 * calculates the sum of val across the group g. The workspace array, x,
 * must be large enough to contain g.size() integers.
 */
__device__ int sumReduction(thread_group g, int *x, int val)
{
    // rank of this thread in the group
    int lane = g.thread_rank();

    // for each iteration of this loop, the number of threads active in the
    // reduction, i, is halved, and each active thread (with index [lane])
    // performs a single summation of its own value with that
    // of a "partner" (with index [lane+i])
    for (int i = g.size() / 2; i > 0; i /= 2) {
        // store value for this thread in temporary array
        x[lane] = val;

        // synchronize all threads in group
        g.sync();

        if (lane < i)
            // active threads perform summation of their value with
            // their partner's value
            val += x[lane + i];

        // synchronize all threads in group
        g.sync();
    }

    // master thread in group returns result, and others return -1.
    if (g.thread_rank() == 0)
        return val;
    else
        return -1;
}

/**
@@ -85,93 +86,92 @@ __device__ int sumReduction(thread_group g, int *x, int val)
 *
 * Creates cooperative groups and performs reductions
 */
__global__ void cgkernel()
{
    // threadBlockGroup includes all threads in the block
    thread_block threadBlockGroup     = this_thread_block();
    int          threadBlockGroupSize = threadBlockGroup.size();

    // workspace array in shared memory required for reduction
    extern __shared__ int workspace[];

    int input, output, expectedOutput;

    // input to reduction, for each thread, is its rank in the group
    input = threadBlockGroup.thread_rank();

    // expected output from analytical formula (n-1)(n)/2
    // (noting that indexing starts at 0 rather than 1)
    expectedOutput = (threadBlockGroupSize - 1) * threadBlockGroupSize / 2;

    // perform reduction
    output = sumReduction(threadBlockGroup, workspace, input);

    // master thread in group prints out result
    if (threadBlockGroup.thread_rank() == 0) {
        printf(" Sum of all ranks 0..%d in threadBlockGroup is %d (expected %d)\n\n",
               (int)threadBlockGroup.size() - 1,
               output,
               expectedOutput);

        printf(" Now creating %d groups, each of size 16 threads:\n\n", (int)threadBlockGroup.size() / 16);
    }

    threadBlockGroup.sync();

    // each tiledPartition16 group includes 16 threads
    thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup);

    // This offset allows each group to have its own unique area in the workspace
    // array
    int workspaceOffset = threadBlockGroup.thread_rank() - tiledPartition16.thread_rank();

    // input to reduction, for each thread, is its rank in the group
    input = tiledPartition16.thread_rank();

    // expected output from analytical formula (n-1)(n)/2
    // (noting that indexing starts at 0 rather than 1)
    expectedOutput = 15 * 16 / 2;

    // Perform reduction
    output = sumReduction(tiledPartition16, workspace + workspaceOffset, input);

    // each master thread prints out result
    if (tiledPartition16.thread_rank() == 0)
        printf(" Sum of all ranks 0..15 in this tiledPartition16 group is %d "
               "(expected %d)\n",
               output,
               expectedOutput);

    return;
}

/**
 * Host main routine
 */
int main()
{
    // Error code to check return values for CUDA calls
    cudaError_t err;

    // Launch the kernel
    int blocksPerGrid   = 1;
    int threadsPerBlock = 64;

    printf("\nLaunching a single block with %d threads...\n\n", threadsPerBlock);

    // we use the optional third argument to specify the size
    // of shared memory required in the kernel
    cgkernel<<<blocksPerGrid, threadsPerBlock, threadsPerBlock * sizeof(int)>>>();
    err = cudaDeviceSynchronize();

    if (err != cudaSuccess) {
        fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }

    printf("\n...Done.\n\n");

    return 0;
}
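For a statically sized tile such as tiledPartition16, the shared-memory workspace can be avoided by using the tile's warp shuffles. A sketch (not part of the sample) of the same sum for a 16-thread tile:

    __device__ int tileSumReduction(cooperative_groups::thread_block_tile<16> tile, int val)
    {
        for (int i = tile.size() / 2; i > 0; i /= 2)
            val += tile.shfl_down(val, i);
        return val; // the full sum ends up in the rank-0 thread of the tile
    }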
@ -26,27 +26,27 @@
*/ */
/* /*
* This sample demonstrates how to use texture fetches from layered 2D textures * This sample demonstrates how to use texture fetches from layered 2D textures
* in CUDA C * in CUDA C
* *
* This sample first generates a 3D input data array for the layered texture * This sample first generates a 3D input data array for the layered texture
* and the expected output. Then it starts CUDA C kernels, one for each layer, * and the expected output. Then it starts CUDA C kernels, one for each layer,
* which fetch their layer's texture data (using normalized texture coordinates) * which fetch their layer's texture data (using normalized texture coordinates)
* transform it to the expected output, and write it to a 3D output data array. * transform it to the expected output, and write it to a 3D output data array.
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes CUDA // includes CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
// helper functions and utilities to work with CUDA // helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h>
static const char *sSDKname = "simpleCubemapTexture"; static const char *sSDKname = "simpleCubemapTexture";
@ -56,213 +56,207 @@ static const char *sSDKname = "simpleCubemapTexture";
//! Transform a cubemap face of a linear buffe using cubemap texture lookups //! Transform a cubemap face of a linear buffe using cubemap texture lookups
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *g_odata, int width, __global__ void transformKernel(float *g_odata, int width, cudaTextureObject_t tex)
cudaTextureObject_t tex) { {
// calculate this thread's data point // calculate this thread's data point
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
// 0.5f offset and division are necessary to access the original data points // 0.5f offset and division are necessary to access the original data points
// in the texture (such that bilinear interpolation will not be activated). // in the texture (such that bilinear interpolation will not be activated).
// For details, see also CUDA Programming Guide, Appendix D // For details, see also CUDA Programming Guide, Appendix D
float u = ((x + 0.5f) / (float)width) * 2.f - 1.f; float u = ((x + 0.5f) / (float)width) * 2.f - 1.f;
float v = ((y + 0.5f) / (float)width) * 2.f - 1.f; float v = ((y + 0.5f) / (float)width) * 2.f - 1.f;
float cx, cy, cz; float cx, cy, cz;
for (unsigned int face = 0; face < 6; face++) { for (unsigned int face = 0; face < 6; face++) {
// Layer 0 is positive X face // Layer 0 is positive X face
if (face == 0) { if (face == 0) {
cx = 1; cx = 1;
cy = -v; cy = -v;
cz = -u; cz = -u;
} }
// Layer 1 is negative X face // Layer 1 is negative X face
else if (face == 1) { else if (face == 1) {
cx = -1; cx = -1;
cy = -v; cy = -v;
cz = u; cz = u;
} }
// Layer 2 is positive Y face // Layer 2 is positive Y face
else if (face == 2) { else if (face == 2) {
cx = u; cx = u;
cy = 1; cy = 1;
cz = v; cz = v;
} }
// Layer 3 is negative Y face // Layer 3 is negative Y face
else if (face == 3) { else if (face == 3) {
cx = u; cx = u;
cy = -1; cy = -1;
cz = -v; cz = -v;
} }
// Layer 4 is positive Z face // Layer 4 is positive Z face
else if (face == 4) { else if (face == 4) {
cx = u; cx = u;
cy = -v; cy = -v;
cz = 1; cz = 1;
} }
// Layer 4 is negative Z face // Layer 4 is negative Z face
else if (face == 5) { else if (face == 5) {
cx = -u; cx = -u;
cy = -v; cy = -v;
cz = -1; cz = -1;
} }
// read from texture, do expected transformation and write to global memory // read from texture, do expected transformation and write to global memory
g_odata[face * width * width + y * width + x] = g_odata[face * width * width + y * width + x] = -texCubemap<float>(tex, cx, cy, cz);
-texCubemap<float>(tex, cx, cy, cz); }
}
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
// use command-line specified CUDA device, otherwise use device with highest {
// Gflops/s // use command-line specified CUDA device, otherwise use device with highest
int devID = findCudaDevice(argc, (const char **)argv); // Gflops/s
int devID = findCudaDevice(argc, (const char **)argv);
bool bResult = true; bool bResult = true;
// get number of SMs on this GPU // get number of SMs on this GPU
cudaDeviceProp deviceProps; cudaDeviceProp deviceProps;
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
deviceProps.multiProcessorCount); printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);
printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);
if (deviceProps.major < 2) { if (deviceProps.major < 2) {
printf( printf("%s requires SM 2.0 or higher for support of Texture Arrays. Test "
"%s requires SM 2.0 or higher for support of Texture Arrays. Test " "will exit... \n",
"will exit... \n", sSDKname);
sSDKname);
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
}
// generate input data for layered texture
unsigned int width = 64, num_faces = 6, num_layers = 1;
unsigned int cubemap_size = width * width * num_faces;
unsigned int size = cubemap_size * num_layers * sizeof(float);
float *h_data = (float *)malloc(size);
for (int i = 0; i < (int)(cubemap_size * num_layers); i++) {
h_data[i] = (float)i;
}
// this is the expected transformation of the input data (the expected output)
float *h_data_ref = (float *)malloc(size);
for (unsigned int layer = 0; layer < num_layers; layer++) {
for (int i = 0; i < (int)(cubemap_size); i++) {
h_data_ref[layer * cubemap_size + i] =
-h_data[layer * cubemap_size + i] + layer;
} }
}
// allocate device memory for result // generate input data for layered texture
float *d_data = NULL; unsigned int width = 64, num_faces = 6, num_layers = 1;
checkCudaErrors(cudaMalloc((void **)&d_data, size)); unsigned int cubemap_size = width * width * num_faces;
unsigned int size = cubemap_size * num_layers * sizeof(float);
float *h_data = (float *)malloc(size);
// allocate array and copy image data for (int i = 0; i < (int)(cubemap_size * num_layers); i++) {
cudaChannelFormatDesc channelDesc = h_data[i] = (float)i;
cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); }
cudaArray *cu_3darray;
// checkCudaErrors(cudaMalloc3DArray( &cu_3darray, &channelDesc,
// make_cudaExtent(width, height, num_layers), cudaArrayLayered ));
    checkCudaErrors(
        cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, width, num_faces), cudaArrayCubemap));

    cudaMemcpy3DParms myparms = {0};
    myparms.srcPos   = make_cudaPos(0, 0, 0);
    myparms.dstPos   = make_cudaPos(0, 0, 0);
    myparms.srcPtr   = make_cudaPitchedPtr(h_data, width * sizeof(float), width, width);
    myparms.dstArray = cu_3darray;
    myparms.extent   = make_cudaExtent(width, width, num_faces);
    myparms.kind     = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&myparms));

    cudaTextureObject_t tex;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cu_3darray;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModeLinear;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.addressMode[2]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, width / dimBlock.y, 1);

    printf("Covering Cubemap data array of %d~3 x %d: Grid size is %d x %d, each "
           "block has 8 x 8 threads\n",
           width,
           num_layers,
           dimGrid.x,
           dimGrid.y);

    transformKernel<<<dimGrid, dimBlock>>>(d_data, width,
                                           tex); // warmup (for better timing)

    // check if kernel execution generated an error
    getLastCudaError("warmup Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // execute the kernel
    transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, tex);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
    printf("%.2f Mtexlookups/sec\n", (cubemap_size / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
    sdkDeleteTimer(&timer);

    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * width, 0.0f, false);
    }
    else {
        printf("Comparing kernel output to expected data\n");

#define MIN_EPSILON_ERROR 5e-3f
        bResult = compareData(h_odata, h_data_ref, cubemap_size, MIN_EPSILON_ERROR, 0.0f);
    }

    // cleanup memory
    free(h_data);
    free(h_data_ref);
    free(h_odata);

    checkCudaErrors(cudaDestroyTextureObject(tex));
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cu_3darray));

    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
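The transformKernel launched above is defined earlier in this file and is not shown in this hunk. As a rough, hypothetical sketch of how a cubemap texture object such as tex is typically sampled on the device (the kernel name and the face-to-direction mapping below are illustrative assumptions, not code from this commit):

    // Illustrative sketch only (not part of this commit). Maps each (x, y) pixel of one
    // cubemap face to a lookup direction and fetches from the cubemap texture object.
    __global__ void sampleCubemapFaceX(float *out, int width, cudaTextureObject_t cubeTex)
    {
        unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
        unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

        // Map the pixel to [-1, 1] across the face.
        float u = 2.0f * ((x + 0.5f) / (float)width) - 1.0f;
        float v = 2.0f * ((y + 0.5f) / (float)width) - 1.0f;

        // For the +X face the direction is (1, -v, -u); other faces permute the components.
        out[y * width + x] = texCubemap<float>(cubeTex, 1.0f, -v, -u);
    }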

View File

@@ -33,12 +33,12 @@
 */

// Includes
#include <cstring>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <stdio.h>
#include <string.h>

// includes, project
#include <helper_cuda.h>
@@ -62,165 +62,165 @@ float *d_B;
float *d_C;

// Functions
int  CleanupNoFailure(CUcontext &cuContext);
void RandomInit(float *, int);
bool findModulePath(const char *, string &, char **, ostringstream &);

static void check(CUresult result, char const *const func, const char *const file, int const line)
{
    if (result) {
        fprintf(stderr, "CUDA error at %s:%d code=%d \"%s\" \n", file, line, static_cast<unsigned int>(result), func);
        exit(EXIT_FAILURE);
    }
}

#define checkCudaDrvErrors(val) check((val), #val, __FILE__, __LINE__)

// Host code
int main(int argc, char **argv)
{
    printf("simpleDrvRuntime..\n");
    int        N = 50000, devID = 0;
    size_t     size = N * sizeof(float);
    CUdevice   cuDevice;
    CUfunction vecAdd_kernel;
    CUmodule   cuModule = 0;
    CUcontext  cuContext;

    // Initialize
    checkCudaDrvErrors(cuInit(0));

    cuDevice = findCudaDevice(argc, (const char **)argv);
    // Create context
    checkCudaDrvErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module path before we load the results
    string        module_path;
    ostringstream fatbin;

    if (!findModulePath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaDrvErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));

    // Get function handle from module
    checkCudaDrvErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));

    // Allocate input vectors h_A and h_B in host memory
    checkCudaErrors(cudaMallocHost(&h_A, size));
    checkCudaErrors(cudaMallocHost(&h_B, size));
    checkCudaErrors(cudaMallocHost(&h_C, size));

    // Initialize input vectors
    RandomInit(h_A, N);
    RandomInit(h_B, N);

    // Allocate vectors in device memory
    checkCudaErrors(cudaMalloc((void **)(&d_A), size));
    checkCudaErrors(cudaMalloc((void **)(&d_B), size));
    checkCudaErrors(cudaMalloc((void **)(&d_C), size));

    cudaStream_t stream;
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    // Copy vectors from host memory to device memory
    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, size, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, size, cudaMemcpyHostToDevice, stream));

    int threadsPerBlock = 256;
    int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

    void *args[] = {&d_A, &d_B, &d_C, &N};

    // Launch the CUDA kernel
    checkCudaDrvErrors(
        cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, stream, args, NULL));

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    checkCudaErrors(cudaMemcpyAsync(h_C, d_C, size, cudaMemcpyDeviceToHost, stream));
    checkCudaErrors(cudaStreamSynchronize(stream));
    // Verify result
    int i;

    for (i = 0; i < N; ++i) {
        float sum = h_A[i] + h_B[i];

        if (fabs(h_C[i] - sum) > 1e-7f) {
            break;
        }
    }

    checkCudaDrvErrors(cuModuleUnload(cuModule));
    CleanupNoFailure(cuContext);
    printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");

    exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
}

int CleanupNoFailure(CUcontext &cuContext)
{
    // Free device memory
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));

    // Free host memory
    if (h_A) {
        checkCudaErrors(cudaFreeHost(h_A));
    }

    if (h_B) {
        checkCudaErrors(cudaFreeHost(h_B));
    }

    if (h_C) {
        checkCudaErrors(cudaFreeHost(h_C));
    }
    checkCudaDrvErrors(cuCtxDestroy(cuContext));

    return EXIT_SUCCESS;
}

// Allocates an array with random float entries.
void RandomInit(float *data, int n)
{
    for (int i = 0; i < n; ++i) {
        data[i] = rand() / (float)RAND_MAX;
    }
}

bool inline findModulePath(const char *module_file, string &module_path, char **argv, ostringstream &ostrm)
{
    char *actual_path = sdkFindFilePath(module_file, argv[0]);

    if (actual_path) {
        module_path = actual_path;
    }
    else {
        printf("> findModulePath file not found: <%s> \n", module_file);
        return false;
    }

    if (module_path.empty()) {
        printf("> findModulePath could not find file: <%s> \n", module_file);
        return false;
    }
    else {
        printf("> findModulePath found file at <%s>\n", module_path.c_str());
        if (module_path.rfind("fatbin") != string::npos) {
            ifstream fileIn(module_path.c_str(), ios::binary);
            ostrm << fileIn.rdbuf();
        }
        return true;
    }
}

View File

@@ -34,9 +34,10 @@
 */

// Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < N)
        C[i] = A[i] + B[i];
}

View File

@@ -44,188 +44,188 @@ const char *sSDKsample = "hyperQ";

// This subroutine does no real work but runs for at least the specified number
// of clock ticks.
__device__ void clock_block(clock_t *d_o, clock_t clock_count)
{
    unsigned int start_clock = (unsigned int)clock();

    clock_t clock_offset = 0;

    while (clock_offset < clock_count) {
        unsigned int end_clock = (unsigned int)clock();

        // The code below should work like
        // this (thanks to modular arithmetics):
        //
        // clock_offset = (clock_t) (end_clock > start_clock ?
        //                           end_clock - start_clock :
        //                           end_clock + (0xffffffffu - start_clock));
        //
        // Indeed, let m = 2^32 then
        // end - start = end + m - start (mod m).

        clock_offset = (clock_t)(end_clock - start_clock);
    }

    d_o[0] = clock_offset;
}

// We create two identical kernels calling clock_block(), we create two so that
// we can identify dependencies in the profile timeline ("kernel_B" is always
// dependent on "kernel_A" in the same stream).
__global__ void kernel_A(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); }
__global__ void kernel_B(clock_t *d_o, clock_t clock_count) { clock_block(d_o, clock_count); }

// Single-warp reduction kernel (note: this is not optimized for simplicity)
__global__ void sum(clock_t *d_clocks, int N)
{
    // Handle to thread block group
    cg::thread_block   cta = cg::this_thread_block();
    __shared__ clock_t s_clocks[32];

    clock_t my_sum = 0;

    for (int i = threadIdx.x; i < N; i += blockDim.x) {
        my_sum += d_clocks[i];
    }

    s_clocks[threadIdx.x] = my_sum;
    cg::sync(cta);

    for (int i = warpSize / 2; i > 0; i /= 2) {
        if (threadIdx.x < i) {
            s_clocks[threadIdx.x] += s_clocks[threadIdx.x + i];
        }

        cg::sync(cta);
    }

    if (threadIdx.x == 0) {
        d_clocks[0] = s_clocks[0];
    }
}

int main(int argc, char **argv)
{
    int   nstreams    = 32; // One stream for each pair of kernels
    float kernel_time = 10; // Time each kernel should run in ms
    float elapsed_time;
    int   cuda_device = 0;

    printf("starting %s...\n", sSDKsample);

    // Get number of streams (if overridden on the command line)
    if (checkCmdLineFlag(argc, (const char **)argv, "nstreams")) {
        nstreams = getCmdLineArgumentInt(argc, (const char **)argv, "nstreams");
    }

    // Use command-line specified CUDA device, otherwise use device with
    // highest Gflops/s
    cuda_device = findCudaDevice(argc, (const char **)argv);

    // Get device properties
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDevice(&cuda_device));
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

    // HyperQ is available in devices of Compute Capability 3.5 and higher
    if (deviceProp.major < 3 || (deviceProp.major == 3 && deviceProp.minor < 5)) {
        if (deviceProp.concurrentKernels == 0) {
            printf("> GPU does not support concurrent kernel execution (SM 3.5 or "
                   "higher required)\n");
            printf("  CUDA kernel runs will be serialized\n");
        }
        else {
            printf("> GPU does not support HyperQ\n");
            printf("  CUDA kernel runs will have limited concurrency\n");
        }
    }

    printf("> Detected Compute SM %d.%d hardware with %d multi-processors\n",
           deviceProp.major,
           deviceProp.minor,
           deviceProp.multiProcessorCount);

    // Allocate host memory for the output (reduced to a single value)
    clock_t *a = 0;
    checkCudaErrors(cudaMallocHost((void **)&a, sizeof(clock_t)));

    // Allocate device memory for the output (one value for each kernel)
    clock_t *d_a = 0;
    checkCudaErrors(cudaMalloc((void **)&d_a, 2 * nstreams * sizeof(clock_t)));

    // Allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

    for (int i = 0; i < nstreams; i++) {
        checkCudaErrors(cudaStreamCreate(&(streams[i])));
    }

    // Create CUDA event handles
    cudaEvent_t start_event, stop_event;
    checkCudaErrors(cudaEventCreate(&start_event));
    checkCudaErrors(cudaEventCreate(&stop_event));

    // Target time per kernel is kernel_time ms, clockRate is in KHz
    // Target number of clocks = target time * clock frequency
#if defined(__arm__) || defined(__aarch64__)
    // the kernel takes more time than the channel reset time on arm archs, so to
    // prevent hangs reduce time_clocks.
    clock_t time_clocks = (clock_t)(kernel_time * (deviceProp.clockRate / 100));
#else
    clock_t time_clocks = (clock_t)(kernel_time * deviceProp.clockRate);
#endif
    clock_t total_clocks = 0;

    // Start the clock
    checkCudaErrors(cudaEventRecord(start_event, 0));

    // Queue pairs of {kernel_A, kernel_B} in separate streams
    for (int i = 0; i < nstreams; ++i) {
        kernel_A<<<1, 1, 0, streams[i]>>>(&d_a[2 * i], time_clocks);
        total_clocks += time_clocks;
        kernel_B<<<1, 1, 0, streams[i]>>>(&d_a[2 * i + 1], time_clocks);
        total_clocks += time_clocks;
    }

    // Stop the clock in stream 0 (i.e. all previous kernels will be complete)
    checkCudaErrors(cudaEventRecord(stop_event, 0));

    // At this point the CPU has dispatched all work for the GPU and can
    // continue processing other tasks in parallel. In this sample we just want
    // to wait until all work is done so we use a blocking cudaMemcpy below.

    // Run the sum kernel and copy the result back to host
    sum<<<1, 32>>>(d_a, 2 * nstreams);
    checkCudaErrors(cudaMemcpy(a, d_a, sizeof(clock_t), cudaMemcpyDeviceToHost));

    // stop_event will have been recorded but including the synchronize here to
    // prevent copy/paste errors!
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));

    printf("Expected time for serial execution of %d sets of kernels is between "
           "approx. %.3fs and %.3fs\n",
           nstreams,
           (nstreams + 1) * kernel_time / 1000.0f,
           2 * nstreams * kernel_time / 1000.0f);
    printf("Expected time for fully concurrent execution of %d sets of kernels is "
           "approx. %.3fs\n",
           nstreams,
           2 * kernel_time / 1000.0f);
    printf("Measured time for sample = %.3fs\n", elapsed_time / 1000.0f);

    bool bTestResult = (a[0] >= total_clocks);

    // Release resources
    for (int i = 0; i < nstreams; i++) {
        cudaStreamDestroy(streams[i]);
    }

    free(streams);
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    cudaFreeHost(a);
    cudaFree(d_a);

    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
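For reference, a worked evaluation of the expected-time formulas printed above, using the default values nstreams = 32 and kernel_time = 10 ms (these are the formulas evaluated on paper, not measurements from any particular GPU):

    // Serial lower bound : (nstreams + 1) * kernel_time = 33 * 10 ms = 0.330 s
    // Serial upper bound : 2 * nstreams * kernel_time   = 64 * 10 ms = 0.640 s
    // Fully concurrent   : 2 * kernel_time              =  2 * 10 ms = 0.020 s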

View File

@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed.

## References (for more details)

View File

@@ -32,6 +32,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <vector>

#include "helper_cuda.h"
#include "helper_multiprocess.h"

static const char shmName[] = "simpleIPCshm";
@@ -39,7 +40,7 @@ static const char shmName[] = "simpleIPCshm";
// For NVSWITCH connected peers like DGX-2, simultaneous peers are not limited
// in the same way.
#define MAX_DEVICES (32)
#define DATA_SIZE   (64ULL << 20ULL) // 64MB

#if defined(__linux__)
#define cpu_atomic_add32(a, x) __sync_add_and_fetch(a, x)
@@ -49,281 +50,280 @@ static const char shmName[] = "simpleIPCshm";
#error Unsupported system
#endif

typedef struct shmStruct_st
{
    size_t nprocesses;
    int    barrier;
    int    sense;
    int    devices[MAX_DEVICES];
    cudaIpcMemHandle_t   memHandle[MAX_DEVICES];
    cudaIpcEventHandle_t eventHandle[MAX_DEVICES];
} shmStruct;

__global__ void simpleKernel(char *ptr, int sz, char val)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (; idx < sz; idx += (gridDim.x * blockDim.x)) {
        ptr[idx] = val;
    }
}

static void barrierWait(volatile int *barrier, volatile int *sense, unsigned int n)
{
    int count;

    // Check-in
    count = cpu_atomic_add32(barrier, 1);
    if (count == n) // Last one in
        *sense = 1;

    while (!*sense)
        ;

    // Check-out
    count = cpu_atomic_add32(barrier, -1);
    if (count == 0) // Last one out
        *sense = 0;

    while (*sense)
        ;
}

static void childProcess(int id)
{
    volatile shmStruct *shm = NULL;
    cudaStream_t        stream;
    sharedMemoryInfo    info;
    size_t              procCount, i;
    int                 blocks  = 0;
    int                 threads = 128;
    cudaDeviceProp      prop;
    std::vector<void *>      ptrs;
    std::vector<cudaEvent_t> events;
    std::vector<char>        verification_buffer(DATA_SIZE);

    if (sharedMemoryOpen(shmName, sizeof(shmStruct), &info) != 0) {
        printf("Failed to create shared memory slab\n");
        exit(EXIT_FAILURE);
    }
    shm       = (volatile shmStruct *)info.addr;
    procCount = shm->nprocesses;

    printf("Process %d: Starting on device %d...\n", id, shm->devices[id]);

    checkCudaErrors(cudaSetDevice(shm->devices[id]));
    checkCudaErrors(cudaGetDeviceProperties(&prop, shm->devices[id]));
    checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks, simpleKernel, threads, 0));
    blocks *= prop.multiProcessorCount;

    // Open and track all the allocations and events created in the master
    // process for use later
    for (i = 0; i < procCount; i++) {
        void       *ptr = NULL;
        cudaEvent_t event;

        // Notice, we don't need to explicitly enable peer access for
        // allocations on other devices.
        checkCudaErrors(
            cudaIpcOpenMemHandle(&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle[i], cudaIpcMemLazyEnablePeerAccess));
        checkCudaErrors(cudaIpcOpenEventHandle(&event, *(cudaIpcEventHandle_t *)&shm->eventHandle[i]));

        ptrs.push_back(ptr);
        events.push_back(event);
    }

    // At each iteration of the loop, each sibling process will push work on
    // their respective devices accessing the next peer mapped buffer allocated
    // by the master process (these can come from other sibling processes as
    // well). To coordinate each process' access, we force the stream to wait for
    // the work already accessing this buffer asynchronously through IPC events,
    // allowing the CPU processes to continue to queue more work.
    for (i = 0; i < procCount; i++) {
        size_t bufferId = (i + id) % procCount;

        // Wait for the buffer to be accessed to be ready
        checkCudaErrors(cudaStreamWaitEvent(stream, events[bufferId], 0));
        // Push a simple kernel on it
        simpleKernel<<<blocks, threads, 0, stream>>>((char *)ptrs[bufferId], DATA_SIZE, id);
        checkCudaErrors(cudaGetLastError());
        // Signal that this buffer is ready for the next consumer
        checkCudaErrors(cudaEventRecord(events[bufferId], stream));
        // Wait for all my sibling processes to push this stage of their work
        // before proceeding to the next. This prevents siblings from racing
        // ahead and clobbering the recorded event or waiting on the wrong
        // recorded event.
        barrierWait(&shm->barrier, &shm->sense, (unsigned int)procCount);
        if (id == 0) {
            printf("Step %lld done\n", (unsigned long long)i);
        }
    }

    // Now wait for my buffer to be ready so I can copy it locally and verify it
    checkCudaErrors(cudaStreamWaitEvent(stream, events[id], 0));
    checkCudaErrors(cudaMemcpyAsync(&verification_buffer[0], ptrs[id], DATA_SIZE, cudaMemcpyDeviceToHost, stream));

    // And wait for all the queued up work to complete
    checkCudaErrors(cudaStreamSynchronize(stream));

    printf("Process %d: verifying...\n", id);

    // The contents should have the id of the sibling just after me
    char compareId = (char)((id + 1) % procCount);
    for (unsigned long long j = 0; j < DATA_SIZE; j++) {
        if (verification_buffer[j] != compareId) {
            printf("Process %d: Verification mismatch at %lld: %d != %d\n",
                   id,
                   j,
                   (int)verification_buffer[j],
                   (int)compareId);
        }
    }

    // Clean up!
    for (i = 0; i < procCount; i++) {
        checkCudaErrors(cudaIpcCloseMemHandle(ptrs[i]));
        checkCudaErrors(cudaEventDestroy(events[i]));
    }

    checkCudaErrors(cudaStreamDestroy(stream));

    printf("Process %d complete!\n", id);
}

static void parentProcess(char *app)
{
    sharedMemoryInfo    info;
    int                 devCount, i;
    volatile shmStruct *shm = NULL;
    std::vector<void *>      ptrs;
    std::vector<cudaEvent_t> events;
    std::vector<Process>     processes;

    checkCudaErrors(cudaGetDeviceCount(&devCount));

    if (sharedMemoryCreate(shmName, sizeof(*shm), &info) != 0) {
        printf("Failed to create shared memory slab\n");
        exit(EXIT_FAILURE);
    }
    shm = (volatile shmStruct *)info.addr;
    memset((void *)shm, 0, sizeof(*shm));

    // Pick all the devices that can access each other's memory for this test
    // Keep in mind that CUDA has minimal support for fork() without a
    // corresponding exec() in the child process, but in this case our
    // spawnProcess will always exec, so no need to worry.
    for (i = 0; i < devCount; i++) {
        bool           allPeers = true;
        cudaDeviceProp prop;
        checkCudaErrors(cudaGetDeviceProperties(&prop, i));

        // CUDA IPC is only supported on devices with unified addressing
        if (!prop.unifiedAddressing) {
            printf("Device %d does not support unified addressing, skipping...\n", i);
            continue;
        }
        // This sample requires two processes accessing each device, so we need
        // to ensure exclusive or prohibited mode is not set
        if (prop.computeMode != cudaComputeModeDefault) {
            printf("Device %d is in an unsupported compute mode for this sample\n", i);
            continue;
        }

        for (int j = 0; j < shm->nprocesses; j++) {
            int canAccessPeerIJ, canAccessPeerJI;
            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerJI, shm->devices[j], i));
            checkCudaErrors(cudaDeviceCanAccessPeer(&canAccessPeerIJ, i, shm->devices[j]));
            if (!canAccessPeerIJ || !canAccessPeerJI) {
                allPeers = false;
                break;
            }
        }

        if (allPeers) {
            // Enable peers here. This isn't necessary for IPC, but it will
            // setup the peers for the device. For systems that only allow 8
            // peers per GPU at a time, this acts to remove devices from CanAccessPeer
            for (int j = 0; j < shm->nprocesses; j++) {
                checkCudaErrors(cudaSetDevice(i));
                checkCudaErrors(cudaDeviceEnablePeerAccess(shm->devices[j], 0));
                checkCudaErrors(cudaSetDevice(shm->devices[j]));
                checkCudaErrors(cudaDeviceEnablePeerAccess(i, 0));
            }

            shm->devices[shm->nprocesses++] = i;
            if (shm->nprocesses >= MAX_DEVICES)
                break;
        }
        else {
            printf("Device %d is not peer capable with some other selected peers, "
                   "skipping\n",
                   i);
        }
    }

    if (shm->nprocesses == 0) {
        printf("No CUDA devices support IPC\n");
        exit(EXIT_WAIVED);
    }

    // Now allocate memory and an event for each process and fill the shared
    // memory buffer with the IPC handles to communicate
    for (i = 0; i < shm->nprocesses; i++) {
        void       *ptr = NULL;
        cudaEvent_t event;

        checkCudaErrors(cudaSetDevice(shm->devices[i]));
        checkCudaErrors(cudaMalloc(&ptr, DATA_SIZE));
        checkCudaErrors(cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle[i], ptr));
        checkCudaErrors(cudaEventCreate(&event, cudaEventDisableTiming | cudaEventInterprocess));
        checkCudaErrors(cudaIpcGetEventHandle((cudaIpcEventHandle_t *)&shm->eventHandle[i], event));

        ptrs.push_back(ptr);
        events.push_back(event);
    }

    // Launch the child processes!
    for (i = 0; i < shm->nprocesses; i++) {
        char        devIdx[12]; // Increased size to ensure enough space for formatted integer
        char *const args[] = {app, devIdx, NULL};
        Process     process;

        snprintf(devIdx, sizeof(devIdx), "%d", i);

        if (spawnProcess(&process, app, args)) {
            printf("Failed to create process\n");
            exit(EXIT_FAILURE);
        }

        processes.push_back(process);
    }

    // And wait for them to finish
    for (i = 0; i < processes.size(); i++) {
        if (waitProcess(&processes[i]) != EXIT_SUCCESS) {
            printf("Process %d failed!\n", i);
            exit(EXIT_FAILURE);
        }
    }

    // Clean up!
    for (i = 0; i < shm->nprocesses; i++) {
        checkCudaErrors(cudaSetDevice(shm->devices[i]));
        checkCudaErrors(cudaEventSynchronize(events[i]));
        checkCudaErrors(cudaEventDestroy(events[i]));
        checkCudaErrors(cudaFree(ptrs[i]));
    }

    sharedMemoryClose(&info);
}

int main(int argc, char **argv)
{
#if defined(__arm__) || defined(__aarch64__)
    printf("Not supported on ARM\n");
    return EXIT_WAIVED;
#else
    if (argc == 1) {
        parentProcess(argv[0]);
    }
    else {
        childProcess(atoi(argv[1]));
    }
    return EXIT_SUCCESS;
#endif
}

View File

@@ -26,27 +26,27 @@
 */

/*
 * This sample demonstrates how to use texture fetches from layered 2D textures
 * in CUDA C
 *
 * This sample first generates a 3D input data array for the layered texture
 * and the expected output. Then it starts CUDA C kernels, one for each layer,
 * which fetch their layer's texture data (using normalized texture coordinates)
 * transform it to the expected output, and write it to a 3D output data array.
 */

// includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, kernels
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples

static const char *sSDKname = "simpleLayeredTexture";
@@ -54,163 +54,156 @@ static const char *sSDKname = "simpleLayeredTexture";
//! Transform a layer of a layered 2D texture using texture lookups
//! @param g_odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *g_odata, int width, int height, int layer, cudaTextureObject_t tex)
{
    // calculate this thread's data point
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // 0.5f offset and division are necessary to access the original data points
    // in the texture (such that bilinear interpolation will not be activated).
    // For details, see also CUDA Programming Guide, Appendix D
    float u = (x + 0.5f) / (float)width;
    float v = (y + 0.5f) / (float)height;

    // read from texture, do expected transformation and write to global memory
    g_odata[layer * width * height + y * width + x] = -tex2DLayered<float>(tex, u, v, layer) + layer;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("[%s] - Starting...\n", sSDKname);

    // use command-line specified CUDA device, otherwise use device with highest
    // Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    bool bResult = true;

    // get number of SMs on this GPU
    cudaDeviceProp deviceProps;

    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors ", deviceProps.name, deviceProps.multiProcessorCount);
    printf("SM %d.%d\n", deviceProps.major, deviceProps.minor);

    // generate input data for layered texture
    unsigned int width = 512, height = 512, num_layers = 5;
    unsigned int size   = width * height * num_layers * sizeof(float);
    float       *h_data = (float *)malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++) {
            h_data[layer * width * height + i] = (float)i;
        }

    // this is the expected transformation of the input data (the expected output)
    float *h_data_ref = (float *)malloc(size);

    for (unsigned int layer = 0; layer < num_layers; layer++)
        for (int i = 0; i < (int)(width * height); i++) {
            h_data_ref[layer * width * height + i] = -h_data[layer * width * height + i] + layer;
        }

    // allocate device memory for result
    float *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, size));

    // allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray            *cu_3darray;
    checkCudaErrors(
        cudaMalloc3DArray(&cu_3darray, &channelDesc, make_cudaExtent(width, height, num_layers), cudaArrayLayered));
    cudaMemcpy3DParms myparms = {0};
    myparms.srcPos   = make_cudaPos(0, 0, 0);
    myparms.dstPos   = make_cudaPos(0, 0, 0);
    myparms.srcPtr   = make_cudaPitchedPtr(h_data, width * sizeof(float), width, height);
    myparms.dstArray = cu_3darray;
    myparms.extent   = make_cudaExtent(width, height, num_layers);
    myparms.kind     = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&myparms));

    cudaTextureObject_t tex;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = cu_3darray;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModeLinear;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));

    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    printf("Covering 2D data array of %d x %d: Grid size is %d x %d, each block has "
           "8 x 8 threads\n",
           width,
           height,
           dimGrid.x,
           dimGrid.y);

    transformKernel<<<dimGrid, dimBlock>>>(d_data, width, height, 0,
                                           tex); // warmup (for better timing)

    // check if kernel execution generated an error
    getLastCudaError("warmup Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());

    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);
    sdkStartTimer(&timer);

    // execute the kernel
    for (unsigned int layer = 0; layer < num_layers; layer++)
        transformKernel<<<dimGrid, dimBlock, 0>>>(d_data, width, height, layer, tex);

    // check if kernel execution generated an error
    getLastCudaError("Kernel execution failed");

    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %.3f msec\n", sdkGetTimerValue(&timer));
    printf("%.2f Mtexlookups/sec\n", (width * height * num_layers / (sdkGetTimerValue(&timer) / 1000.0f) / 1e6));
    sdkDeleteTimer(&timer);

    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cudaMemcpy(h_odata, d_data, size, cudaMemcpyDeviceToHost));

    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, false);
    }
    else {
        printf("Comparing kernel output to expected data\n");

#define MIN_EPSILON_ERROR 5e-3f
        bResult = compareData(h_odata, h_data_ref, width * height * num_layers, MIN_EPSILON_ERROR, 0.0f);
    }

    // cleanup memory
    free(h_data);
    free(h_data_ref);
    free(h_odata);

    checkCudaErrors(cudaDestroyTextureObject(tex));
    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFreeArray(cu_3darray));

    exit(bResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

View File

@@ -26,15 +26,15 @@
 */

/* Simple example demonstrating how to use MPI with CUDA
 *
 * Generate some random numbers on one node.
 * Dispatch them to all nodes.
 * Compute their square root on each node's GPU.
 * Compute the average of the results using MPI.
 *
 * simpleMPI.cpp: main program, compiled with mpicxx on linux/Mac platforms
 *                on Windows, please download the Microsoft HPC Pack SDK 2008
 */

// MPI include
#include <mpi.h>
@@ -42,87 +42,88 @@
// System includes
#include <iostream>

using std::cerr;
using std::cout;
using std::endl;

// User include
#include "simpleMPI.h"

// Error handling macros
#define MPI_CHECK(call)                               \
    if ((call) != MPI_SUCCESS) {                      \
        cerr << "MPI error calling \"" #call "\"\n";  \
        my_abort(-1);                                 \
    }

// Host code
// No CUDA here, only MPI
int main(int argc, char *argv[])
{
    // Dimensions of the dataset
    int blockSize = 256;
    int gridSize = 10000;
    int dataSizePerNode = gridSize * blockSize;

    // Initialize MPI state
    MPI_CHECK(MPI_Init(&argc, &argv));

    // Get our MPI node number and node count
    int commSize, commRank;
    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &commSize));
    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &commRank));

    // Generate some random numbers on the root node (node 0)
    int dataSizeTotal = dataSizePerNode * commSize;
    float *dataRoot = NULL;

    // Are we the root node?
    if (commRank == 0) {
        cout << "Running on " << commSize << " nodes" << endl;
        dataRoot = new float[dataSizeTotal];
        initData(dataRoot, dataSizeTotal);
    }

    // Allocate a buffer on each node
    float *dataNode = new float[dataSizePerNode];

    // Dispatch a portion of the input data to each node
    MPI_CHECK(
        MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD));

    if (commRank == 0) {
        // No need for root data any more
        delete[] dataRoot;
    }

    // On each node, run computation on GPU
    computeGPU(dataNode, blockSize, gridSize);

    // Reduction to the root node, computing the sum of output elements
    float sumNode = sum(dataNode, dataSizePerNode);
    float sumRoot;

    MPI_CHECK(MPI_Reduce(&sumNode, &sumRoot, 1, MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD));

    if (commRank == 0) {
        float average = sumRoot / dataSizeTotal;
        cout << "Average of square roots is: " << average << endl;
    }

    // Cleanup
    delete[] dataNode;
    MPI_CHECK(MPI_Finalize());

    if (commRank == 0) {
        cout << "PASSED\n";
    }

    return 0;
}

// Shut down MPI cleanly if something goes wrong
void my_abort(int err)
{
    cout << "Test FAILED\n";
    MPI_Abort(MPI_COMM_WORLD, err);
}
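
The final average printed by this sample has a known expected value: initData fills the input with uniform random numbers in [0, 1], and the mean of sqrt(x) over that range is 2/3. A standalone sketch of that sanity check, which the sample itself does not perform:

    // Monte-Carlo estimate of E[sqrt(x)] for x uniform on [0, 1]; the sample's
    // "Average of square roots" output should approach the same 2/3 value.
    #include <cmath>
    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        const int n = 1 << 20;
        double acc = 0.0;
        for (int i = 0; i < n; i++) {
            acc += std::sqrt((double)rand() / RAND_MAX);
        }
        printf("estimate: %f, analytic value: %f\n", acc / n, 2.0 / 3.0);
        return 0;
    }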

View File

@@ -26,14 +26,14 @@
 */

/* Simple example demonstrating how to use MPI with CUDA
 *
 * Generate some random numbers on one node.
 * Dispatch them to all nodes.
 * Compute their square root on each node's GPU.
 * Compute the average of the results using MPI.
 *
 * simpleMPI.cu: GPU part, compiled with nvcc
 */

#include <iostream>

using std::cerr;
@@ -42,61 +42,63 @@ using std::endl;
#include "simpleMPI.h"

// Error handling macro
#define CUDA_CHECK(call)                                                      \
    if ((call) != cudaSuccess) {                                              \
        cudaError_t err = cudaGetLastError();                                 \
        cerr << "CUDA error calling \"" #call "\", code is " << err << endl;  \
        my_abort(err);                                                        \
    }

// Device code
// Very simple GPU Kernel that computes square roots of input numbers
__global__ void simpleMPIKernel(float *input, float *output)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    output[tid] = sqrt(input[tid]);
}

// Initialize an array with random data (between 0 and 1)
void initData(float *data, int dataSize)
{
    for (int i = 0; i < dataSize; i++) {
        data[i] = (float)rand() / RAND_MAX;
    }
}

// CUDA computation on each node
// No MPI here, only CUDA
void computeGPU(float *hostData, int blockSize, int gridSize)
{
    int dataSize = blockSize * gridSize;

    // Allocate data on GPU memory
    float *deviceInputData = NULL;
    CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float)));

    float *deviceOutputData = NULL;
    CUDA_CHECK(cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float)));

    // Copy to GPU memory
    CUDA_CHECK(cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice));

    // Run kernel
    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);

    // Copy data back to CPU memory
    CUDA_CHECK(cudaMemcpy(hostData, deviceOutputData, dataSize * sizeof(float), cudaMemcpyDeviceToHost));

    // Free GPU memory
    CUDA_CHECK(cudaFree(deviceInputData));
    CUDA_CHECK(cudaFree(deviceOutputData));
}

float sum(float *data, int size)
{
    float accum = 0.f;

    for (int i = 0; i < size; i++) {
        accum += data[i];
    }

    return accum;
}
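
simpleMPIKernel launches exactly gridSize * blockSize threads for a buffer of dataSize = blockSize * gridSize elements, so it needs no index guard. A hypothetical guarded variant, sketched only to show what would change if the element count were not an exact multiple of the launch size (it is not part of this sample):

    // Hypothetical bounds-checked variant of the kernel above.
    __global__ void simpleMPIKernelGuarded(const float *input, float *output, int n)
    {
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n) { // guard against threads past the end of the buffer
            output[tid] = sqrtf(input[tid]);
        }
    }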

View File

@@ -26,19 +26,20 @@
 */

/* Simple example demonstrating how to use MPI with CUDA
 *
 * Generate some random numbers on one node.
 * Dispatch them to all nodes.
 * Compute their square root on each node's GPU.
 * Compute the average of the results using MPI.
 *
 * simpleMPI.h: common header file
 */

// Forward declarations
extern "C"
{
    void initData(float *data, int dataSize);
    void computeGPU(float *hostData, int blockSize, int gridSize);
    float sum(float *data, int size);
    void my_abort(int err);
}

View File

@@ -38,7 +38,7 @@
 *
 * Elapsed times are averaged over nreps repetitions (10 by default).
 *
 */

const char *sSDKname = "simpleMultiCopy";
@@ -50,25 +50,26 @@ const char *sSDKname = "simpleMultiCopy";
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples

// includes, kernels
// Declare the CUDA kernels here and main() code that is needed to launch
// Compute workload on the system
__global__ void incKernel(int *g_out, int *g_in, int N, int inner_reps)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < N) {
        for (int i = 0; i < inner_reps; ++i) {
            g_out[idx] = g_in[idx] + 1;
        }
    }
}

#define STREAM_COUNT 4

// Uncomment to simulate data source/sink IO times
// #define SIMULATE_IO

int *h_data_source;
int *h_data_sink;

@@ -79,13 +80,13 @@ int *d_data_in[STREAM_COUNT];
int *h_data_out[STREAM_COUNT];
int *d_data_out[STREAM_COUNT];

cudaEvent_t cycleDone[STREAM_COUNT];
cudaStream_t stream[STREAM_COUNT];
cudaEvent_t start, stop;

int N = 1 << 22;
int nreps = 10; // number of times each experiment is repeated
int inner_reps = 5;

int memsize;

@@ -96,278 +97,268 @@ dim3 grid;
int thread_blocks;

float processWithStreams(int streams_used);
void init();
bool test();

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char *argv[])
{
    int cuda_device = 0;
    float scale_factor;
    cudaDeviceProp deviceProp;

    printf("[%s] - Starting...\n", sSDKname);

    if (checkCmdLineFlag(argc, (const char **)argv, "device")) {
        cuda_device = getCmdLineArgumentInt(argc, (const char **)argv, "device=");

        if (cuda_device < 0) {
            printf("Invalid command line parameters\n");
            exit(EXIT_FAILURE);
        }
        else {
            printf("cuda_device = %d\n", cuda_device);
            cuda_device = gpuDeviceInit(cuda_device);

            if (cuda_device < 0) {
                printf("No CUDA Capable devices found, exiting...\n");
                exit(EXIT_SUCCESS);
            }
        }
    }
    else {
        // Otherwise pick the device with the highest Gflops/s
        cuda_device = gpuGetMaxGflopsDeviceId();
        checkCudaErrors(cudaSetDevice(cuda_device));
        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
        printf("> Using CUDA device [%d]: %s\n", cuda_device, deviceProp.name);
    }

    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));
    printf("[%s] has %d MP(s) x %d (Cores/MP) = %d (Cores)\n",
           deviceProp.name,
           deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

    // Anything that is less than 32 Cores will have scaled down workload
    scale_factor =
        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
            1.0f);
    N = (int)((float)N / scale_factor);

    printf("> Device name: %s\n", deviceProp.name);
    printf("> CUDA Capability %d.%d hardware with %d multi-processors\n",
           deviceProp.major,
           deviceProp.minor,
           deviceProp.multiProcessorCount);
    printf("> scale_factor = %.2f\n", 1.0f / scale_factor);
    printf("> array_size = %d\n\n", N);

    memsize = N * sizeof(int);

    thread_blocks = N / block.x;

    grid.x = thread_blocks % 65535;
    grid.y = (thread_blocks / 65535 + 1);

    // Allocate resources
    h_data_source = (int *)malloc(memsize);
    h_data_sink = (int *)malloc(memsize);

    for (int i = 0; i < STREAM_COUNT; ++i) {
        checkCudaErrors(cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault));
        checkCudaErrors(cudaMalloc(&d_data_in[i], memsize));
        checkCudaErrors(cudaMemset(d_data_in[i], 0, memsize));

        checkCudaErrors(cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault));
        checkCudaErrors(cudaMalloc(&d_data_out[i], memsize));

        checkCudaErrors(cudaStreamCreate(&stream[i]));
        checkCudaErrors(cudaEventCreate(&cycleDone[i]));

        cudaEventRecord(cycleDone[i], stream[i]);
    }

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    init();

    // Kernel warmup
    incKernel<<<grid, block>>>(d_data_out[0], d_data_in[0], N, inner_reps);

    // Time copies and kernel
    cudaEventRecord(start, 0);
    checkCudaErrors(cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice, 0));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float memcpy_h2d_time;
    cudaEventElapsedTime(&memcpy_h2d_time, start, stop);

    cudaEventRecord(start, 0);
    checkCudaErrors(cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0));
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float memcpy_d2h_time;
    cudaEventElapsedTime(&memcpy_d2h_time, start, stop);

    cudaEventRecord(start, 0);
    incKernel<<<grid, block, 0, 0>>>(d_data_out[0], d_data_in[0], N, inner_reps);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float kernel_time;
    cudaEventElapsedTime(&kernel_time, start, stop);

    printf("\n");
    printf("Relevant properties of this CUDA device\n");
    printf("(%s) Can overlap one CPU<>GPU data transfer with GPU kernel execution "
           "(device property \"deviceOverlap\")\n",
           deviceProp.deviceOverlap ? "X" : " ");
    // printf("(%s) Can execute several GPU kernels simultaneously (compute
    // capability >= 2.0)\n", deviceProp.major >= 2 ? "X": " ");
    printf("(%s) Can overlap two CPU<>GPU data transfers with GPU kernel execution\n"
           " (Compute Capability >= 2.0 AND (Tesla product OR Quadro "
           "4000/5000/6000/K5000)\n",
           (deviceProp.major >= 2 && deviceProp.asyncEngineCount > 1) ? "X" : " ");

    printf("\n");
    printf("Measured timings (throughput):\n");
    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time, (memsize * 1e-6) / memcpy_h2d_time);
    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time, (memsize * 1e-6) / memcpy_d2h_time);
    printf(" Kernel\t\t\t: %f ms (%f GB/s)\n", kernel_time, (inner_reps * memsize * 2e-6) / kernel_time);

    printf("\n");
    printf("Theoretical limits for speedup gained from overlapped data "
           "transfers:\n");
    printf("No overlap at all (transfer-kernel-transfer): %f ms \n", memcpy_h2d_time + memcpy_d2h_time + kernel_time);
    printf("Compute can overlap with one transfer: %f ms\n", max((memcpy_h2d_time + memcpy_d2h_time), kernel_time));
    printf("Compute can overlap with both data transfers: %f ms\n",
           max(max(memcpy_h2d_time, memcpy_d2h_time), kernel_time));

    // Process pipelined work
    float serial_time = processWithStreams(1);
    float overlap_time = processWithStreams(STREAM_COUNT);

    printf("\nAverage measured timings over %d repetitions:\n", nreps);
    printf(" Avg. time when execution fully serialized\t: %f ms\n", serial_time / nreps);
    printf(" Avg. time when overlapped using %d streams\t: %f ms\n", STREAM_COUNT, overlap_time / nreps);
    printf(" Avg. speedup gained (serialized - overlapped)\t: %f ms\n", (serial_time - overlap_time) / nreps);

    printf("\nMeasured throughput:\n");
    printf(" Fully serialized execution\t\t: %f GB/s\n", (nreps * (memsize * 2e-6)) / serial_time);
    printf(" Overlapped using %d streams\t\t: %f GB/s\n", STREAM_COUNT, (nreps * (memsize * 2e-6)) / overlap_time);

    // Verify the results, we will use the results for final output
    bool bResults = test();

    // Free resources
    free(h_data_source);
    free(h_data_sink);

    for (int i = 0; i < STREAM_COUNT; ++i) {
        cudaFreeHost(h_data_in[i]);
        cudaFree(d_data_in[i]);

        cudaFreeHost(h_data_out[i]);
        cudaFree(d_data_out[i]);

        cudaStreamDestroy(stream[i]);
        cudaEventDestroy(cycleDone[i]);
    }

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Test result
    exit(bResults ? EXIT_SUCCESS : EXIT_FAILURE);
}

float processWithStreams(int streams_used)
{
    int current_stream = 0;

    float time;

    // Do processing in a loop
    //
    // Note: All memory commands are processed in the order they are issued,
    // independent of the stream they are enqueued in. Hence the pattern by
    // which the copy and kernel commands are enqueued in the stream
    // has an influence on the achieved overlap.

    cudaEventRecord(start, 0);

    for (int i = 0; i < nreps; ++i) {
        int next_stream = (current_stream + 1) % streams_used;

#ifdef SIMULATE_IO
        // Store the result
        memcpy(h_data_sink, h_data_out[current_stream], memsize);

        // Read new input
        memcpy(h_data_in[next_stream], h_data_source, memsize);
#endif

        // Ensure that processing and copying of the last cycle has finished
        cudaEventSynchronize(cycleDone[next_stream]);

        // Process current frame
        incKernel<<<grid, block, 0, stream[current_stream]>>>(
            d_data_out[current_stream], d_data_in[current_stream], N, inner_reps);

        // Upload next frame
        checkCudaErrors(cudaMemcpyAsync(
            d_data_in[next_stream], h_data_in[next_stream], memsize, cudaMemcpyHostToDevice, stream[next_stream]));

        // Download current frame
        checkCudaErrors(cudaMemcpyAsync(h_data_out[current_stream],
                                        d_data_out[current_stream],
                                        memsize,
                                        cudaMemcpyDeviceToHost,
                                        stream[current_stream]));

        checkCudaErrors(cudaEventRecord(cycleDone[current_stream], stream[current_stream]));

        current_stream = next_stream;
    }

    cudaEventRecord(stop, 0);

    cudaDeviceSynchronize();

    cudaEventElapsedTime(&time, start, stop);

    return time;
}

void init()
{
    for (int i = 0; i < N; ++i) {
        h_data_source[i] = 0;
    }

    for (int i = 0; i < STREAM_COUNT; ++i) {
        memcpy(h_data_in[i], h_data_source, memsize);
    }
}

bool test()
{
    bool passed = true;

    for (int j = 0; j < STREAM_COUNT; ++j) {
        for (int i = 0; i < N; ++i) {
            passed &= (h_data_out[j][i] == 1);
        }
    }

    return passed;
}
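
The three "theoretical limits" printed by main() come straight from the per-operation timings: fully serialized is the sum, overlapping compute with the copies in one direction is the max of (copies, kernel), and overlapping with both directions is the max of all three. A small worked example with made-up timings, not measurements from the sample:

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        // assumed timings in ms
        float h2d = 4.0f, kernel = 6.0f, d2h = 4.0f;
        printf("no overlap           : %.1f ms\n", h2d + kernel + d2h);                   // 14.0
        printf("overlap one transfer : %.1f ms\n", std::max(h2d + d2h, kernel));          // 8.0
        printf("overlap both         : %.1f ms\n", std::max(std::max(h2d, d2h), kernel)); // 6.0
        return 0;
    }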

View File

@@ -37,15 +37,15 @@
 */

// System includes
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
@@ -57,180 +57,176 @@
// Data configuration
////////////////////////////////////////////////////////////////////////////////
const int MAX_GPU_COUNT = 32;
const int DATA_N = 1048576 * 32;

////////////////////////////////////////////////////////////////////////////////
// Simple reduction kernel.
// Refer to the 'reduction' CUDA Sample describing
// reduction optimization strategies
////////////////////////////////////////////////////////////////////////////////
__global__ static void reduceKernel(float *d_Result, float *d_Input, int N)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    const int threadN = gridDim.x * blockDim.x;
    float sum = 0;

    for (int pos = tid; pos < N; pos += threadN)
        sum += d_Input[pos];

    d_Result[tid] = sum;
}

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    // Solver config
    TGPUplan plan[MAX_GPU_COUNT];

    // GPU reduction results
    float h_SumGPU[MAX_GPU_COUNT];

    float sumGPU;
    double sumCPU, diff;

    int i, j, gpuBase, GPU_N;

    const int BLOCK_N = 32;
    const int THREAD_N = 256;
    const int ACCUM_N = BLOCK_N * THREAD_N;

    printf("Starting simpleMultiGPU\n");
    checkCudaErrors(cudaGetDeviceCount(&GPU_N));

    if (GPU_N > MAX_GPU_COUNT) {
        GPU_N = MAX_GPU_COUNT;
    }

    printf("CUDA-capable device count: %i\n", GPU_N);
    printf("Generating input data...\n\n");

    // Subdividing input data across GPUs
    // Get data sizes for each GPU
    for (i = 0; i < GPU_N; i++) {
        plan[i].dataN = DATA_N / GPU_N;
    }

    // Take into account "odd" data sizes
    for (i = 0; i < DATA_N % GPU_N; i++) {
        plan[i].dataN++;
    }

    // Assign data ranges to GPUs
    gpuBase = 0;

    for (i = 0; i < GPU_N; i++) {
        plan[i].h_Sum = h_SumGPU + i;
        gpuBase += plan[i].dataN;
    }

    // Create streams for issuing GPU command asynchronously and allocate memory
    // (GPU and System page-locked)
    for (i = 0; i < GPU_N; i++) {
        checkCudaErrors(cudaSetDevice(i));
        checkCudaErrors(cudaStreamCreate(&plan[i].stream));
        // Allocate memory
        checkCudaErrors(cudaMalloc((void **)&plan[i].d_Data, plan[i].dataN * sizeof(float)));
        checkCudaErrors(cudaMalloc((void **)&plan[i].d_Sum, ACCUM_N * sizeof(float)));
        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Sum_from_device, ACCUM_N * sizeof(float)));
        checkCudaErrors(cudaMallocHost((void **)&plan[i].h_Data, plan[i].dataN * sizeof(float)));

        for (j = 0; j < plan[i].dataN; j++) {
            plan[i].h_Data[j] = (float)rand() / (float)RAND_MAX;
        }
    }

    // Start timing and compute on GPU(s)
    printf("Computing with %d GPUs...\n", GPU_N);
    // create and start timer
    StopWatchInterface *timer = NULL;
    sdkCreateTimer(&timer);

    // start the timer
    sdkStartTimer(&timer);

    // Copy data to GPU, launch the kernel and copy data back. All asynchronously
    for (i = 0; i < GPU_N; i++) {
        // Set device
        checkCudaErrors(cudaSetDevice(i));

        // Copy input data from CPU
        checkCudaErrors(cudaMemcpyAsync(
            plan[i].d_Data, plan[i].h_Data, plan[i].dataN * sizeof(float), cudaMemcpyHostToDevice, plan[i].stream));

        // Perform GPU computations
        reduceKernel<<<BLOCK_N, THREAD_N, 0, plan[i].stream>>>(plan[i].d_Sum, plan[i].d_Data, plan[i].dataN);
        getLastCudaError("reduceKernel() execution failed.\n");

        // Read back GPU results
        checkCudaErrors(cudaMemcpyAsync(
            plan[i].h_Sum_from_device, plan[i].d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost, plan[i].stream));
    }

    // Process GPU results
    for (i = 0; i < GPU_N; i++) {
        float sum;

        // Set device
        checkCudaErrors(cudaSetDevice(i));

        // Wait for all operations to finish
        cudaStreamSynchronize(plan[i].stream);

        // Finalize GPU reduction for current subvector
        sum = 0;

        for (j = 0; j < ACCUM_N; j++) {
            sum += plan[i].h_Sum_from_device[j];
        }

        *(plan[i].h_Sum) = (float)sum;

        // Shut down this GPU
        checkCudaErrors(cudaFreeHost(plan[i].h_Sum_from_device));
        checkCudaErrors(cudaFree(plan[i].d_Sum));
        checkCudaErrors(cudaFree(plan[i].d_Data));
        checkCudaErrors(cudaStreamDestroy(plan[i].stream));
    }

    sumGPU = 0;

    for (i = 0; i < GPU_N; i++) {
        sumGPU += h_SumGPU[i];
    }

    sdkStopTimer(&timer);
    printf(" GPU Processing time: %f (ms)\n\n", sdkGetTimerValue(&timer));
    sdkDeleteTimer(&timer);

    // Compute on Host CPU
    printf("Computing with Host CPU...\n\n");

    sumCPU = 0;

    for (i = 0; i < GPU_N; i++) {
        for (j = 0; j < plan[i].dataN; j++) {
            sumCPU += plan[i].h_Data[j];
        }
    }

    // Compare GPU and CPU results
    printf("Comparing GPU and Host CPU results...\n");
    diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
    printf(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU);
    printf(" Relative difference: %E \n\n", diff);

    // Cleanup and shutdown
    for (i = 0; i < GPU_N; i++) {
        checkCudaErrors(cudaSetDevice(i));
        checkCudaErrors(cudaFreeHost(plan[i].h_Data));
    }

    exit((diff < 1e-5) ? EXIT_SUCCESS : EXIT_FAILURE);
}
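
When DATA_N is not evenly divisible by the number of GPUs, the two loops near the top of main() give the first DATA_N % GPU_N plans one extra element. A small standalone illustration with made-up sizes (10 elements over 3 GPUs yields per-GPU counts of 4, 3, 3):

    #include <cstdio>

    int main()
    {
        const int DATA_N = 10, GPU_N = 3; // assumed illustration sizes
        int dataN[GPU_N];
        for (int i = 0; i < GPU_N; i++)
            dataN[i] = DATA_N / GPU_N; // even share
        for (int i = 0; i < DATA_N % GPU_N; i++)
            dataN[i]++; // distribute the remainder
        for (int i = 0; i < GPU_N; i++)
            printf("GPU %d gets %d elements\n", i, dataN[i]);
        return 0;
    }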

View File

@@ -37,26 +37,26 @@
#ifndef SIMPLEMULTIGPU_H
#define SIMPLEMULTIGPU_H

typedef struct
{
    // Host-side input data
    int dataN;
    float *h_Data;

    // Partial sum for this GPU
    float *h_Sum;

    // Device buffers
    float *d_Data, *d_Sum;

    // Reduction copied back from GPU
    float *h_Sum_from_device;

    // Stream for asynchronous command execution
    cudaStream_t stream;

} TGPUplan;

extern "C" void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N, cudaStream_t &s);

#endif

View File

@@ -25,8 +25,8 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <helper_cuda.h> // helper functions for CUDA error check
#include <iostream>

const int manualBlockSize = 32;

@@ -38,13 +38,14 @@ const int manualBlockSize = 32;
// execution configuration, including anything the launch configurator
// API suggests.
////////////////////////////////////////////////////////////////////////////////
__global__ void square(int *array, int arrayCount)
{
    extern __shared__ int dynamicSmem[];
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    if (idx < arrayCount) {
        array[idx] *= array[idx];
    }
}

////////////////////////////////////////////////////////////////////////////////
@@ -58,29 +59,28 @@ __global__ void square(int *array, int arrayCount) {
// This wrapper routine computes the occupancy of kernel, and reports
// it in terms of active warps / maximum warps per SM.
////////////////////////////////////////////////////////////////////////////////
static double reportPotentialOccupancy(void *kernel, int blockSize, size_t dynamicSMem)
{
    int device;
    cudaDeviceProp prop;

    int numBlocks;
    int activeWarps;
    int maxWarps;

    double occupancy;

    checkCudaErrors(cudaGetDevice(&device));
    checkCudaErrors(cudaGetDeviceProperties(&prop, device));

    checkCudaErrors(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, kernel, blockSize, dynamicSMem));

    activeWarps = numBlocks * blockSize / prop.warpSize;
    maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;

    occupancy = (double)activeWarps / maxWarps;

    return occupancy;
}

////////////////////////////////////////////////////////////////////////////////
@@ -99,65 +99,63 @@ static double reportPotentialOccupancy(void *kernel, int blockSize,
// This function configures the launch based on the "automatic"
// argument, records the runtime, and reports occupancy and runtime.
////////////////////////////////////////////////////////////////////////////////
static int launchConfig(int *array, int arrayCount, bool automatic)
{
    int blockSize;
    int minGridSize;
    int gridSize;
    size_t dynamicSMemUsage = 0;

    cudaEvent_t start;
    cudaEvent_t end;

    float elapsedTime;

    double potentialOccupancy;

    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&end));

    if (automatic) {
        checkCudaErrors(
            cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void *)square, dynamicSMemUsage, arrayCount));

        std::cout << "Suggested block size: " << blockSize << std::endl
                  << "Minimum grid size for maximum occupancy: " << minGridSize << std::endl;
    }
    else {
        // This block size is too small. Given limited number of
        // active blocks per multiprocessor, the number of active
        // threads will be limited, and thus unable to achieve maximum
        // occupancy.
        //
        blockSize = manualBlockSize;
    }

    // Round up
    //
    gridSize = (arrayCount + blockSize - 1) / blockSize;

    // Launch and profile
    //
    checkCudaErrors(cudaEventRecord(start));
    square<<<gridSize, blockSize, dynamicSMemUsage>>>(array, arrayCount);
    checkCudaErrors(cudaEventRecord(end));

    checkCudaErrors(cudaDeviceSynchronize());

    // Calculate occupancy
    //
    potentialOccupancy = reportPotentialOccupancy((void *)square, blockSize, dynamicSMemUsage);

    std::cout << "Potential occupancy: " << potentialOccupancy * 100 << "%" << std::endl;

    // Report elapsed time
    //
    checkCudaErrors(cudaEventElapsedTime(&elapsedTime, start, end));
    std::cout << "Elapsed time: " << elapsedTime << "ms" << std::endl;

    return 0;
}

////////////////////////////////////////////////////////////////////////////////
@@ -166,41 +164,41 @@ static int launchConfig(int *array, int arrayCount, bool automatic) {
// The test generates an array and squares it with a CUDA kernel, then
// verifies the result.
////////////////////////////////////////////////////////////////////////////////
static int test(bool automaticLaunchConfig, const int count = 1000000)
{
    int *array;
    int *dArray;
    int size = count * sizeof(int);

    array = new int[count];

    for (int i = 0; i < count; i += 1) {
        array[i] = i;
    }

    checkCudaErrors(cudaMalloc(&dArray, size));
    checkCudaErrors(cudaMemcpy(dArray, array, size, cudaMemcpyHostToDevice));

    for (int i = 0; i < count; i += 1) {
        array[i] = 0;
    }

    launchConfig(dArray, count, automaticLaunchConfig);

    checkCudaErrors(cudaMemcpy(array, dArray, size, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaFree(dArray));

    // Verify the return data
    //
    for (int i = 0; i < count; i += 1) {
        if (array[i] != i * i) {
            std::cout << "element " << i << " expected " << i * i << " actual " << array[i] << std::endl;
            return 1;
        }
    }

    delete[] array;

    return 0;
}

////////////////////////////////////////////////////////////////////////////////
@@ -210,31 +208,31 @@ static int test(bool automaticLaunchConfig, const int count = 1000000) {
// automatically configured launch, and reports the occupancy and
// performance.
////////////////////////////////////////////////////////////////////////////////
int main()
{
    int status;

    std::cout << "starting Simple Occupancy" << std::endl << std::endl;

    std::cout << "[ Manual configuration with " << manualBlockSize << " threads per block ]" << std::endl;

    status = test(false);

    if (status) {
        std::cerr << "Test failed\n" << std::endl;
        return -1;
    }

    std::cout << std::endl;

    std::cout << "[ Automatic, occupancy-based configuration ]" << std::endl;
    status = test(true);

    if (status) {
        std::cerr << "Test failed\n" << std::endl;
        return -1;
    }

    std::cout << std::endl;

    std::cout << "Test PASSED\n" << std::endl;

    return 0;
}
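
The occupancy figure reported by reportPotentialOccupancy is simply active warps divided by the maximum warps an SM can hold. A worked example with assumed numbers: a 32-thread block, an assumed cudaOccupancyMaxActiveBlocksPerMultiprocessor result of 16 blocks, and typical limits of warpSize = 32 and 2048 threads per SM:

    #include <cstdio>

    int main()
    {
        int numBlocks = 16, blockSize = 32;        // assumed occupancy-query result and block size
        int warpSize = 32, maxThreadsPerSM = 2048; // typical values; real ones are device-specific
        int activeWarps = numBlocks * blockSize / warpSize; // 16
        int maxWarps = maxThreadsPerSM / warpSize;          // 64
        printf("occupancy = %.0f%%\n", 100.0 * activeWarps / maxWarps); // 25%
        return 0;
    }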

View File

@@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed.

## References (for more details)

View File

@@ -31,230 +31,233 @@
 */

// includes, system
#include <stdio.h>
#include <stdlib.h>

// CUDA includes
#include <cuda_runtime.h>

// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples

__global__ void SimpleKernel(float *src, float *dst)
{
    // Just a dummy kernel, doing enough for us to verify that everything
    // worked
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    dst[idx] = src[idx] * 2.0f;
}

inline bool IsAppBuiltAs64() { return sizeof(void *) == 8; }

int main(int argc, char **argv)
{
    printf("[%s] - Starting...\n", argv[0]);

    if (!IsAppBuiltAs64()) {
        printf("%s is only supported with on 64-bit OSs and the application must be "
               "built as a 64-bit target. Test is being waived.\n",
               argv[0]);
        exit(EXIT_WAIVED);
    }

    // Number of GPUs
    printf("Checking for multiple GPUs...\n");
    int gpu_n;
    checkCudaErrors(cudaGetDeviceCount(&gpu_n));
    printf("CUDA-capable device count: %i\n", gpu_n);

    if (gpu_n < 2) {
        printf("Two or more GPUs with Peer-to-Peer access capability are required for "
               "%s.\n",
               argv[0]);
        printf("Waiving test.\n");
        exit(EXIT_WAIVED);
    }

    // Query device properties
    cudaDeviceProp prop[64];
    int gpuid[2]; // we want to find the first two GPU's that can support P2P

    for (int i = 0; i < gpu_n; i++) {
        checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
    }

    // Check possibility for peer access
    printf("\nChecking GPU(s) for support of peer to peer memory access...\n");

    int can_access_peer;
    int p2pCapableGPUs[2]; // We take only 1 pair of P2P capable GPUs
    p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1;

    // Show all the combinations of supported P2P GPUs
    for (int i = 0; i < gpu_n; i++) {
        for (int j = 0; j < gpu_n; j++) {
            if (i == j) {
                continue;
            }
            checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, i, j));
            printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
                   prop[i].name,
                   i,
                   prop[j].name,
                   j,
                   can_access_peer ? "Yes" : "No");
            if (can_access_peer && p2pCapableGPUs[0] == -1) {
                p2pCapableGPUs[0] = i;
                p2pCapableGPUs[1] = j;
            }
        }
    }

    if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1) {
        printf("Two or more GPUs with Peer-to-Peer access capability are required for "
               "%s.\n",
               argv[0]);
        printf("Peer to Peer access is not available amongst GPUs in the system, "
               "waiving test.\n");

        exit(EXIT_WAIVED);
    }

    // Use first pair of p2p capable GPUs detected.
    gpuid[0] = p2pCapableGPUs[0];
    gpuid[1] = p2pCapableGPUs[1];

    // Enable peer access
    printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid[0], gpuid[1]);
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[1], 0));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    checkCudaErrors(cudaDeviceEnablePeerAccess(gpuid[0], 0));

    // Allocate buffers
    const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
    printf(
        "Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", int(buf_size / 1024 / 1024), gpuid[0], gpuid[1]);
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    float *g0;
    checkCudaErrors(cudaMalloc(&g0, buf_size));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    float *g1;
    checkCudaErrors(cudaMalloc(&g1, buf_size));
    float *h0;
    checkCudaErrors(cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA

    // Create CUDA event handles
    printf("Creating event handles...\n");
    cudaEvent_t start_event, stop_event;
    float time_memcpy;
    int eventflags = cudaEventBlockingSync;
    checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
    checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));

    // P2P memcopy() benchmark
    checkCudaErrors(cudaEventRecord(start_event, 0));

    for (int i = 0; i < 100; i++) {
        // With UVA we don't need to specify source and target devices, the
        // runtime figures this out by itself from the pointers
        // Ping-pong copy between GPUs
        if (i % 2 == 0) {
            checkCudaErrors(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
        }
        else {
            checkCudaErrors(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
        }
    }

    checkCudaErrors(cudaEventRecord(stop_event, 0));
    checkCudaErrors(cudaEventSynchronize(stop_event));
    checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
    printf("cudaMemcpyPeer / cudaMemcpy between GPU%d and GPU%d: %.2fGB/s\n",
           gpuid[0],
           gpuid[1],
           (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);

    // Prepare host buffer and copy to GPU 0
    printf("Preparing host buffer and memcpy to GPU%d...\n", gpuid[0]);

    for (int i = 0; i < buf_size / sizeof(float); i++) {
        h0[i] = float(i % 4096);
    }

    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault));

    // Kernel launch configuration
    const dim3 threads(512, 1);
    const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);

    // Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
    // output to the GPU 1 buffer
    printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
           "GPU%d...\n",
           gpuid[1],
           gpuid[0],
           gpuid[1]);
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    SimpleKernel<<<blocks, threads>>>(g0, g1);
    checkCudaErrors(cudaDeviceSynchronize());

    // Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
    // output to the GPU 0 buffer
    printf("Run kernel on GPU%d, taking source data from GPU%d and writing to "
           "GPU%d...\n",
           gpuid[0],
           gpuid[1],
           gpuid[0]);
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    SimpleKernel<<<blocks, threads>>>(g1, g0);
    checkCudaErrors(cudaDeviceSynchronize());

    // Copy data back to host and verify
    printf("Copy data back to host from GPU%d and verify results...\n", gpuid[0]);
    checkCudaErrors(cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault));

    int error_count = 0;

    for (int i = 0; i < buf_size / sizeof(float); i++) {
        // Re-generate input data and apply 2x '* 2.0f' computation of both
        // kernel runs
        if (h0[i] != float(i % 4096) * 2.0f * 2.0f) {
            printf("Verification error @ element %i: val = %f, ref = %f\n", i, h0[i], (float(i % 4096) * 2.0f * 2.0f));

            if (error_count++ > 10) {
                break;
            }
        }
    }

    // Disable peer access (also unregisters memory for non-UVA cases)
    printf("Disabling peer access...\n");
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[1]));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[0]));

    // Cleanup and shutdown
    printf("Shutting down...\n");
    checkCudaErrors(cudaEventDestroy(start_event));
    checkCudaErrors(cudaEventDestroy(stop_event));
    checkCudaErrors(cudaSetDevice(gpuid[0]));
    checkCudaErrors(cudaFree(g0));
    checkCudaErrors(cudaSetDevice(gpuid[1]));
    checkCudaErrors(cudaFree(g1));
    checkCudaErrors(cudaFreeHost(h0));

    for (int i = 0; i < gpu_n; i++) {
        checkCudaErrors(cudaSetDevice(i));
    }

    if (error_count != 0) {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }
    else {
        printf("Test passed\n");
        exit(EXIT_SUCCESS);
    }
}
if (error_count++ > 10) {
break;
}
}
}
// Disable peer access (also unregisters memory for non-UVA cases)
printf("Disabling peer access...\n");
checkCudaErrors(cudaSetDevice(gpuid[0]));
checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[1]));
checkCudaErrors(cudaSetDevice(gpuid[1]));
checkCudaErrors(cudaDeviceDisablePeerAccess(gpuid[0]));
// Cleanup and shutdown
printf("Shutting down...\n");
checkCudaErrors(cudaEventDestroy(start_event));
checkCudaErrors(cudaEventDestroy(stop_event));
checkCudaErrors(cudaSetDevice(gpuid[0]));
checkCudaErrors(cudaFree(g0));
checkCudaErrors(cudaSetDevice(gpuid[1]));
checkCudaErrors(cudaFree(g1));
checkCudaErrors(cudaFreeHost(h0));
for (int i = 0; i < gpu_n; i++) {
checkCudaErrors(cudaSetDevice(i));
}
if (error_count != 0) {
printf("Test failed!\n");
exit(EXIT_FAILURE);
}
else {
printf("Test passed\n");
exit(EXIT_SUCCESS);
}
}
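
Aside: the block below is an illustrative, self-contained sketch of the UVA peer-to-peer pattern the sample above times. It is not code from this commit; it assumes devices 0 and 1 are P2P capable, the buffer size is arbitrary, and error checking is omitted for brevity.

// Hypothetical standalone sketch of UVA peer-to-peer copies between two GPUs.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    int accessible01 = 0, accessible10 = 0;
    cudaDeviceCanAccessPeer(&accessible01, 0, 1);
    cudaDeviceCanAccessPeer(&accessible10, 1, 0);
    if (!accessible01 || !accessible10) {
        printf("GPUs 0 and 1 cannot access each other, nothing to demonstrate.\n");
        return 0;
    }

    // Each device grants the other access to its memory.
    cudaSetDevice(0);
    cudaDeviceEnablePeerAccess(1, 0);
    cudaSetDevice(1);
    cudaDeviceEnablePeerAccess(0, 0);

    // Allocate one buffer per device.
    const size_t bytes = 16 * 1024 * 1024;
    float *g0 = nullptr, *g1 = nullptr;
    cudaSetDevice(0);
    cudaMalloc(&g0, bytes);
    cudaSetDevice(1);
    cudaMalloc(&g1, bytes);

    // With UVA the runtime infers source and destination devices from the pointers.
    cudaMemcpy(g1, g0, bytes, cudaMemcpyDefault);

    // Tear down: disable peer access and free per-device buffers.
    cudaSetDevice(0);
    cudaDeviceDisablePeerAccess(1);
    cudaFree(g0);
    cudaSetDevice(1);
    cudaDeviceDisablePeerAccess(0);
    cudaFree(g1);
    return 0;
}

Once peer access is enabled, a plain cudaMemcpy with cudaMemcpyDefault is enough; the runtime resolves which device owns each pointer.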

View File

@ -26,16 +26,16 @@
 */

/* pitchLinearTexture
 *
 * This example demonstrates how to use textures bound to pitch linear memory.
 * It performs a shift of matrix elements using wrap addressing mode (aka
 * periodic boundary conditions) on two arrays, a pitch linear and a CUDA array,
 * in order to highlight the differences in using each.
 *
 * Textures binding to pitch linear memory is a new feature in CUDA 2.2,
 * and allows use of texture features such as wrap addressing mode and
 * filtering which are not possible with textures bound to regular linear memory
 */

// includes, system
#include <stdio.h>

@ -50,13 +50,13 @@
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

#define NUM_REPS 100 // number of repetitions performed
#define TILE_DIM 16  // tile/block size

const char *sSDKsample = "simplePitchLinearTexture";
@ -70,29 +70,26 @@ bool bTestResult = true;
////////////////////////////////////////////////////////////////////////////////
//! Shifts matrix elements using pitch linear array
//! @param odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
shiftPitchLinear(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefPL)
{
    int xid = blockIdx.x * blockDim.x + threadIdx.x;
    int yid = blockIdx.y * blockDim.y + threadIdx.y;

    odata[yid * pitch + xid] = tex2D<float>(texRefPL, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
}

////////////////////////////////////////////////////////////////////////////////
//! Shifts matrix elements using regular array
//! @param odata output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void
shiftArray(float *odata, int pitch, int width, int height, int shiftX, int shiftY, cudaTextureObject_t texRefArray)
{
    int xid = blockIdx.x * blockDim.x + threadIdx.x;
    int yid = blockIdx.y * blockDim.y + threadIdx.y;

    odata[yid * pitch + xid] = tex2D<float>(texRefArray, (xid + shiftX) / (float)width, (yid + shiftY) / (float)height);
}

////////////////////////////////////////////////////////////////////////////////
@ -102,210 +99,199 @@ void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n\n", sSDKsample);

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sSDKsample, bTestResult ? "OK" : "ERROR!");
    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    // Set array size
    const int nx = 2048;
    const int ny = 2048;

    // Setup shifts applied to x and y data
    const int x_shift = 5;
    const int y_shift = 7;

    if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0)) {
        printf("nx and ny must be multiples of TILE_DIM\n");
        exit(EXIT_FAILURE);
    }

    // Setup execution configuration parameters
    dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM), dimBlock(TILE_DIM, TILE_DIM);

    // This will pick the best possible CUDA capable device
    int devID = findCudaDevice(argc, (const char **)argv);

    // CUDA events for timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Host allocation and initialization
    float *h_idata = (float *)malloc(sizeof(float) * nx * ny);
    float *h_odata = (float *)malloc(sizeof(float) * nx * ny);
    float *gold    = (float *)malloc(sizeof(float) * nx * ny);

    for (int i = 0; i < nx * ny; ++i) {
        h_idata[i] = (float)i;
    }

    // Device memory allocation
    // Pitch linear input data
    float *d_idataPL;
    size_t d_pitchBytes;

    checkCudaErrors(cudaMallocPitch((void **)&d_idataPL, &d_pitchBytes, nx * sizeof(float), ny));

    // Array input data
    cudaArray            *d_idataArray;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

    checkCudaErrors(cudaMallocArray(&d_idataArray, &channelDesc, nx, ny));

    // Pitch linear output data
    float *d_odata;
    checkCudaErrors(cudaMallocPitch((void **)&d_odata, &d_pitchBytes, nx * sizeof(float), ny));

    // Copy host data to device
    // Pitch linear
    size_t h_pitchBytes = nx * sizeof(float);

    checkCudaErrors(
        cudaMemcpy2D(d_idataPL, d_pitchBytes, h_idata, h_pitchBytes, nx * sizeof(float), ny, cudaMemcpyHostToDevice));

    // Array
    checkCudaErrors(cudaMemcpyToArray(d_idataArray, 0, 0, h_idata, nx * ny * sizeof(float), cudaMemcpyHostToDevice));

    cudaTextureObject_t texRefPL;
    cudaTextureObject_t texRefArray;
    cudaResourceDesc    texRes;
    memset(&texRes, 0, sizeof(cudaResourceDesc));

    texRes.resType                  = cudaResourceTypePitch2D;
    texRes.res.pitch2D.devPtr       = d_idataPL;
    texRes.res.pitch2D.desc         = channelDesc;
    texRes.res.pitch2D.width        = nx;
    texRes.res.pitch2D.height       = ny;
    texRes.res.pitch2D.pitchInBytes = h_pitchBytes;

    cudaTextureDesc texDescr;
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModePoint;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&texRefPL, &texRes, &texDescr, NULL));

    memset(&texRes, 0, sizeof(cudaResourceDesc));
    memset(&texDescr, 0, sizeof(cudaTextureDesc));

    texRes.resType         = cudaResourceTypeArray;
    texRes.res.array.array = d_idataArray;

    texDescr.normalizedCoords = true;
    texDescr.filterMode       = cudaFilterModePoint;
    texDescr.addressMode[0]   = cudaAddressModeWrap;
    texDescr.addressMode[1]   = cudaAddressModeWrap;
    texDescr.readMode         = cudaReadModeElementType;

    checkCudaErrors(cudaCreateTextureObject(&texRefArray, &texRes, &texDescr, NULL));

    // Reference calculation
    for (int j = 0; j < ny; ++j) {
        int jshift = (j + y_shift) % ny;

        for (int i = 0; i < nx; ++i) {
            int ishift       = (i + x_shift) % nx;
            gold[j * nx + i] = h_idata[jshift * nx + ishift];
        }
    }

    // Run ShiftPitchLinear kernel
    checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
checkCudaErrors(cudaEventRecord(start, 0));
for (int i = 0; i < NUM_REPS; ++i) {
shiftPitchLinear<<<dimGrid, dimBlock>>>(
d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefPL);
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
float timePL;
checkCudaErrors(cudaEventElapsedTime(&timePL, start, stop));
// Check results
checkCudaErrors(
cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
bool res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);
bTestResult = true;
if (res == false) {
printf("*** shiftPitchLinear failed ***\n");
bTestResult = false;
}
// Run ShiftArray kernel
checkCudaErrors(cudaMemset2D(d_odata, d_pitchBytes, 0, nx * sizeof(float), ny));
checkCudaErrors(cudaEventRecord(start, 0));
for (int i = 0; i < NUM_REPS; ++i) {
shiftArray<<<dimGrid, dimBlock>>>(
d_odata, (int)(d_pitchBytes / sizeof(float)), nx, ny, x_shift, y_shift, texRefArray);
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
float timeArray;
checkCudaErrors(cudaEventElapsedTime(&timeArray, start, stop));
// Check results
checkCudaErrors(
cudaMemcpy2D(h_odata, h_pitchBytes, d_odata, d_pitchBytes, nx * sizeof(float), ny, cudaMemcpyDeviceToHost));
res = compareData(gold, h_odata, nx * ny, 0.0f, 0.15f);
if (res == false) {
printf("*** shiftArray failed ***\n");
bTestResult = false;
}
float bandwidthPL = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timePL / NUM_REPS);
float bandwidthArray = 2.f * 1000.f * nx * ny * sizeof(float) / (1.e+9f) / (timeArray / NUM_REPS);
printf("\nBandwidth (GB/s) for pitch linear: %.2e; for array: %.2e\n", bandwidthPL, bandwidthArray);
float fetchRatePL = nx * ny / 1.e+6f / (timePL / (1000.0f * NUM_REPS));
float fetchRateArray = nx * ny / 1.e+6f / (timeArray / (1000.0f * NUM_REPS));
printf("\nTexture fetch rate (Mpix/s) for pitch linear: "
"%.2e; for array: %.2e\n\n",
fetchRatePL,
fetchRateArray);
// Cleanup
free(h_idata);
free(h_odata);
free(gold);
checkCudaErrors(cudaDestroyTextureObject(texRefPL));
checkCudaErrors(cudaDestroyTextureObject(texRefArray));
checkCudaErrors(cudaFree(d_idataPL));
checkCudaErrors(cudaFreeArray(d_idataArray));
checkCudaErrors(cudaFree(d_odata));
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
}
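
Aside: a minimal sketch (illustrative only, not from this commit) of binding a texture object to pitch linear memory with wrap addressing, the pattern the sample above benchmarks. The helper name and sizes are hypothetical and error checking is omitted.

// Hypothetical helper: allocate pitch linear memory and wrap it in a texture object.
#include <cstring>
#include <cuda_runtime.h>

cudaTextureObject_t makePitchLinearTexture(float **devPtr, size_t *pitchBytes, int width, int height)
{
    // Pitch linear allocation: each row is padded to a hardware-friendly pitch.
    cudaMallocPitch((void **)devPtr, pitchBytes, width * sizeof(float), height);

    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType                  = cudaResourceTypePitch2D;
    resDesc.res.pitch2D.devPtr       = *devPtr;
    resDesc.res.pitch2D.desc         = cudaCreateChannelDesc<float>();
    resDesc.res.pitch2D.width        = width;
    resDesc.res.pitch2D.height       = height;
    resDesc.res.pitch2D.pitchInBytes = *pitchBytes;

    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.normalizedCoords = true;                // coordinates in [0, 1)
    texDesc.filterMode       = cudaFilterModePoint; // no interpolation
    texDesc.addressMode[0]   = cudaAddressModeWrap; // periodic boundary in x
    texDesc.addressMode[1]   = cudaAddressModeWrap; // periodic boundary in y
    texDesc.readMode         = cudaReadModeElementType;

    cudaTextureObject_t tex = 0;
    cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
    return tex;
}

Wrap addressing is what makes the shift-with-periodic-boundaries kernel a single tex2D fetch; with plain linear memory the index arithmetic would have to handle the wraparound explicitly.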

View File

@ -26,48 +26,49 @@
 */

// System includes
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef MAX
#define MAX(a, b) (a > b ? a : b)
#endif

__global__ void testKernel(int val)
{
    printf("[%d, %d]:\t\tValue is:%d\n",
           blockIdx.y * gridDim.x + blockIdx.x,
           threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x,
           val);
}

int main(int argc, char **argv)
{
    int            devID;
    cudaDeviceProp props;

    // This will pick the best possible CUDA capable device
    devID = findCudaDevice(argc, (const char **)argv);

    // Get GPU information
    checkCudaErrors(cudaGetDevice(&devID));
    checkCudaErrors(cudaGetDeviceProperties(&props, devID));
    printf("Device %d: \"%s\" with Compute capability %d.%d\n", devID, props.name, props.major, props.minor);

    printf("printf() is called. Output:\n\n");

    // Kernel configuration, where a two-dimensional grid and
    // three-dimensional blocks are configured.
    dim3 dimGrid(2, 2);
    dim3 dimBlock(2, 2, 2);
    testKernel<<<dimGrid, dimBlock>>>(10);
    cudaDeviceSynchronize();

    return EXIT_SUCCESS;
}

View File

@ -44,141 +44,137 @@
 *
 * Elapsed times are averaged over nreps repetitions (10 by default).
 *
 */

const char *sSDKsample = "simpleStreams";

const char *sEventSyncMethod[] = {"cudaEventDefault", "cudaEventBlockingSync", "cudaEventDisableTiming", NULL};

const char *sDeviceSyncMethod[] = {"cudaDeviceScheduleAuto",
                                   "cudaDeviceScheduleSpin",
                                   "cudaDeviceScheduleYield",
                                   "INVALID",
                                   "cudaDeviceScheduleBlockingSync",
                                   NULL};

// System includes
#include <assert.h>
#include <stdio.h>

// CUDA runtime
#include <cuda_runtime.h>

// helper functions and utilities to work with CUDA
#include <helper_cuda.h>
#include <helper_functions.h>

#ifndef WIN32
#include <sys/mman.h> // for mmap() / munmap()
#endif

// Macro to aligned up to the memory size in question
#define MEMORY_ALIGNMENT 4096
#define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))
__global__ void init_array(int *g_data, int *factor, int num_iterations)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    for (int i = 0; i < num_iterations; i++) {
        g_data[idx] += *factor; // non-coalesced on purpose, to burn time
    }
}

bool correct_data(int *a, const int n, const int c)
{
for (int i = 0; i < n; i++) {
if (a[i] != c) {
printf("%d: %d %d\n", i, a[i], c);
return false;
}
}
return true;
}
inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
{
#if CUDART_VERSION >= 4000
#if !defined(__arm__) && !defined(__aarch64__)
    if (bPinGenericMemory) {
        // allocate a generic page-aligned chunk of system memory
#ifdef WIN32
        printf("> VirtualAlloc() allocating %4.2f Mbytes of (generic page-aligned "
               "system memory)\n",
               (float)nbytes / 1048576.0f);
        *pp_a = (int *)VirtualAlloc(NULL, (nbytes + MEMORY_ALIGNMENT), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
#else
        printf("> mmap() allocating %4.2f Mbytes (generic page-aligned system "
               "memory)\n",
               (float)nbytes / 1048576.0f);
        *pp_a = (int *)mmap(NULL, (nbytes + MEMORY_ALIGNMENT), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
#endif

        *ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);

        printf("> cudaHostRegister() registering %4.2f Mbytes of generic allocated "
               "system memory\n",
               (float)nbytes / 1048576.0f);
        // pin allocate memory
        checkCudaErrors(cudaHostRegister(*ppAligned_a, nbytes, cudaHostRegisterMapped));
    }
    else
#endif
#endif
    {
        printf("> cudaMallocHost() allocating %4.2f Mbytes of system memory\n", (float)nbytes / 1048576.0f);
        // allocate host memory (pinned is required for achieve asynchronicity)
        checkCudaErrors(cudaMallocHost((void **)pp_a, nbytes));
        *ppAligned_a = *pp_a;
    }
}
inline void FreeHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nbytes)
{
#if CUDART_VERSION >= 4000
#if !defined(__arm__) && !defined(__aarch64__)
    // CUDA 4.0 support pinning of generic host memory
    if (bPinGenericMemory) {
        // unpin and delete host memory
        checkCudaErrors(cudaHostUnregister(*ppAligned_a));
#ifdef WIN32
        VirtualFree(*pp_a, 0, MEM_RELEASE);
#else
        munmap(*pp_a, nbytes);
#endif
    }
    else
#endif
#endif
    {
        cudaFreeHost(*pp_a);
    }
}
static const char *sSyncMethod[] = {"0 (Automatic Blocking)",
                                    "1 (Spin Blocking)",
                                    "2 (Yield Blocking)",
                                    "3 (Undefined Blocking Method)",
                                    "4 (Blocking Sync Event) = low CPU utilization",
                                    NULL};

void printHelp()
{
    printf("Usage: %s [options below]\n", sSDKsample);
    printf("\t--sync_method=n for CPU/GPU synchronization\n");
    printf("\t             n=%s\n", sSyncMethod[0]);
    printf("\t             n=%s\n", sSyncMethod[1]);
    printf("\t             n=%s\n", sSyncMethod[2]);
    printf("\t   <Default> n=%s\n", sSyncMethod[4]);
    printf("\t--use_generic_memory (default) use generic page-aligned for system "
           "memory\n");
    printf("\t--use_cuda_malloc_host (optional) use cudaMallocHost to allocate "
           "system memory\n");
}
#if defined(__APPLE__) || defined(MACOSX)
@ -187,259 +183,240 @@ void printHelp() {
#define DEFAULT_PINNED_GENERIC_MEMORY true
#endif

int main(int argc, char **argv)
{
    int   cuda_device = 0;
    int   nstreams = 4;             // number of streams for CUDA calls
    int   nreps = 10;               // number of times each experiment is repeated
    int   n = 16 * 1024 * 1024;     // number of ints in the data set
    int   nbytes = n * sizeof(int); // number of data bytes
    dim3  threads, blocks;          // kernel launch configuration
    float elapsed_time, time_memcpy, time_kernel; // timing variables
    float scale_factor = 1.0f;

    // allocate generic memory and pin it laster instead of using cudaHostAlloc()
    bool bPinGenericMemory  = DEFAULT_PINNED_GENERIC_MEMORY; // we want this to be the default behavior
    int  device_sync_method = cudaDeviceBlockingSync;        // by default we use BlockingSync

    int niterations; // number of iterations for the loop inside the kernel

    printf("[ %s ]\n\n", sSDKsample);

    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
        printHelp();
        return EXIT_SUCCESS;
    }

    if ((device_sync_method = getCmdLineArgumentInt(argc, (const char **)argv, "sync_method")) >= 0) {
        if (device_sync_method == 0 || device_sync_method == 1 || device_sync_method == 2 || device_sync_method == 4) {
            printf("Device synchronization method set to = %s\n", sSyncMethod[device_sync_method]);
            printf("Setting reps to 100 to demonstrate steady state\n");
            nreps = 100;
        }
        else {
            printf("Invalid command line option sync_method=\"%d\"\n", device_sync_method);
            return EXIT_FAILURE;
        }
    }
    else {
        printHelp();
        return EXIT_SUCCESS;
    }
    if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) {
#if defined(__APPLE__) || defined(MACOSX)
        bPinGenericMemory = false; // Generic Pinning of System Paged memory not
                                   // currently supported on Mac OSX
#else
        bPinGenericMemory = true;
#endif
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "use_cuda_malloc_host")) {
        bPinGenericMemory = false;
    }

    printf("\n> ");
    cuda_device = findCudaDevice(argc, (const char **)argv);

    // check the compute capability of the device
    int num_devices = 0;
    checkCudaErrors(cudaGetDeviceCount(&num_devices));

    if (0 == num_devices) {
        printf("your system does not have a CUDA capable device, waiving test...\n");
        return EXIT_WAIVED;
    }

    // check if the command-line chosen device ID is within range, exit if not
    if (cuda_device >= num_devices) {
        printf("cuda_device=%d is invalid, must choose device ID between 0 and %d\n", cuda_device, num_devices - 1);
        return EXIT_FAILURE;
    }

    checkCudaErrors(cudaSetDevice(cuda_device));

    // Checking for compute capabilities
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, cuda_device));

    niterations = 5;

    // Check if GPU can map host memory (Generic Method), if not then we override
    // bPinGenericMemory to be false
    if (bPinGenericMemory) {
        printf("Device: <%s> canMapHostMemory: %s\n", deviceProp.name, deviceProp.canMapHostMemory ? "Yes" : "No");

        if (deviceProp.canMapHostMemory == 0) {
            printf("Using cudaMallocHost, CUDA device does not support mapping of "
                   "generic host memory\n");
            bPinGenericMemory = false;
        }
    }
    // Anything that is less than 32 Cores will have scaled down workload
    scale_factor =
        max((32.0f / (_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * (float)deviceProp.multiProcessorCount)),
            1.0f);
    n = (int)rint((float)n / scale_factor);

    printf("> CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor);
    printf("> %d Multiprocessor(s) x %d (Cores/Multiprocessor) = %d (Cores)\n",
           deviceProp.multiProcessorCount,
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
           _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);

    printf("> scale_factor = %1.4f\n", 1.0f / scale_factor);
    printf("> array_size = %d\n\n", n);

    // enable use of blocking sync, to reduce CPU usage
    printf("> Using CPU/GPU Device Synchronization method (%s)\n", sDeviceSyncMethod[device_sync_method]);
    checkCudaErrors(cudaSetDeviceFlags(device_sync_method | (bPinGenericMemory ? cudaDeviceMapHost : 0)));

    // allocate host memory
    int  c = 5;          // value to which the array will be initialized
    int *h_a = 0;        // pointer to the array data in host memory
    int *hAligned_a = 0; // pointer to the array data in host memory (aligned to
                         // MEMORY_ALIGNMENT)

    // Allocate Host memory (could be using cudaMallocHost or VirtualAlloc/mmap if
    // using the new CUDA 4.0 features
    AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);

    // allocate device memory
    int *d_a = 0,
        *d_c = 0; // pointers to data and init value in the device memory
    checkCudaErrors(cudaMalloc((void **)&d_a, nbytes));
    checkCudaErrors(cudaMemset(d_a, 0x0, nbytes));
    checkCudaErrors(cudaMalloc((void **)&d_c, sizeof(int)));
    checkCudaErrors(cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice));

    printf("\nStarting Test\n");

    // allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));

    for (int i = 0; i < nstreams; i++) {
        checkCudaErrors(cudaStreamCreate(&(streams[i])));
    }

    // create CUDA event handles
    // use blocking sync
    cudaEvent_t start_event, stop_event;
    int         eventflags = ((device_sync_method == cudaDeviceBlockingSync) ? cudaEventBlockingSync : cudaEventDefault);
checkCudaErrors(cudaEventCreateWithFlags(&start_event, eventflags));
checkCudaErrors(cudaEventCreateWithFlags(&stop_event, eventflags));
// time memcopy from device
checkCudaErrors(cudaEventRecord(start_event, 0)); // record in stream-0, to
// ensure that all previous
// CUDA calls have
// completed
checkCudaErrors(cudaMemcpyAsync(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost, streams[0]));
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event)); // block until the event is actually recorded
checkCudaErrors(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
printf("memcopy:\t%.2f\n", time_memcpy);
// time kernel
threads = dim3(512, 1);
blocks = dim3(n / threads.x, 1);
checkCudaErrors(cudaEventRecord(start_event, 0));
init_array<<<blocks, threads, 0, streams[0]>>>(d_a, d_c, niterations);
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&time_kernel, start_event, stop_event));
printf("kernel:\t\t%.2f\n", time_kernel);
//////////////////////////////////////////////////////////////////////
// time non-streamed execution for reference
threads = dim3(512, 1);
blocks = dim3(n / threads.x, 1);
checkCudaErrors(cudaEventRecord(start_event, 0));
for (int k = 0; k < nreps; k++) {
init_array<<<blocks, threads>>>(d_a, d_c, niterations);
checkCudaErrors(cudaMemcpy(hAligned_a, d_a, nbytes, cudaMemcpyDeviceToHost));
}
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
printf("non-streamed:\t%.2f\n", elapsed_time / nreps);
//////////////////////////////////////////////////////////////////////
// time execution with nstreams streams
threads = dim3(512, 1);
blocks = dim3(n / (nstreams * threads.x), 1);
    memset(hAligned_a, 255, nbytes); // set host memory bits to all 1s, for testing correctness
checkCudaErrors(cudaMemset(d_a, 0, nbytes)); // set device memory to all 0s, for testing correctness
checkCudaErrors(cudaEventRecord(start_event, 0));
for (int k = 0; k < nreps; k++) {
// asynchronously launch nstreams kernels, each operating on its own portion
// of data
for (int i = 0; i < nstreams; i++) {
init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * n / nstreams, d_c, niterations);
}
// asynchronously launch nstreams memcopies. Note that memcopy in stream x
// will only
// commence executing when all previous CUDA calls in stream x have
// completed
for (int i = 0; i < nstreams; i++) {
checkCudaErrors(cudaMemcpyAsync(hAligned_a + i * n / nstreams,
d_a + i * n / nstreams,
nbytes / nstreams,
cudaMemcpyDeviceToHost,
streams[i]));
}
}
checkCudaErrors(cudaEventRecord(stop_event, 0));
checkCudaErrors(cudaEventSynchronize(stop_event));
checkCudaErrors(cudaEventElapsedTime(&elapsed_time, start_event, stop_event));
printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps);
// check whether the output is correct
printf("-------------------------------\n");
bool bResults = correct_data(hAligned_a, n, c * nreps * niterations);
// release resources
    for (int i = 0; i < nstreams; i++) {
        checkCudaErrors(cudaStreamDestroy(streams[i]));
    }

    checkCudaErrors(cudaEventDestroy(start_event));
    checkCudaErrors(cudaEventDestroy(stop_event));

    // Free cudaMallocHost or Generic Host allocated memory (from CUDA 4.0)
    FreeHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nbytes);

    checkCudaErrors(cudaFree(d_a));
    checkCudaErrors(cudaFree(d_c));

    return bResults ? EXIT_SUCCESS : EXIT_FAILURE;
}
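
Aside: an illustrative sketch (not from this commit) of the streamed kernel/copy overlap the sample above measures. The kernel, helper name, and slice sizes are hypothetical; the host buffer is assumed to be pinned (e.g. allocated with cudaMallocHost), n is assumed divisible by nstreams and the slice by the block size, and error checking is omitted.

// Hypothetical sketch: each stream runs a kernel on its slice and copies it back asynchronously.
#include <cuda_runtime.h>

__global__ void scaleSlice(int *data, int factor)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    data[idx] *= factor;
}

void runSlices(int *d_data, int *h_pinned, int n, int nstreams)
{
    cudaStream_t *streams = new cudaStream_t[nstreams];
    for (int i = 0; i < nstreams; i++) {
        cudaStreamCreate(&streams[i]);
    }

    const int slice   = n / nstreams; // elements per stream
    const int threads = 512;

    for (int i = 0; i < nstreams; i++) {
        int *d_slice = d_data + i * slice;
        scaleSlice<<<slice / threads, threads, 0, streams[i]>>>(d_slice, 2);
        // The copy in stream i starts only after stream i's kernel has finished.
        cudaMemcpyAsync(h_pinned + i * slice, d_slice, slice * sizeof(int), cudaMemcpyDeviceToHost, streams[i]);
    }

    // Wait for every stream's work before touching h_pinned on the host.
    cudaDeviceSynchronize();

    for (int i = 0; i < nstreams; i++) {
        cudaStreamDestroy(streams[i]);
    }
    delete[] streams;
}

Because copies and kernels in different streams can overlap, the per-slice launches hide part of the device-to-host transfer time behind the remaining kernels, which is exactly the effect the "%d streams" timing above demonstrates.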

View File

@ -34,10 +34,10 @@
 */

// Includes, system
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN
@ -49,18 +49,18 @@
#include <cuda_runtime.h>

// Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h

// CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check

#define MIN_EPSILON_ERROR 5e-3f

////////////////////////////////////////////////////////////////////////////////
// Define the files that are to be save and the reference images for validation
const char *imageFilename = "teapot512.pgm";
const char *refFilename   = "ref_rotated.pgm";

float angle = 0.5f; // angle to rotate image by (in radians)

// Auto-Verification Code
bool testResult = true;

@ -73,223 +73,218 @@ static const char *sampleName = "simpleSurfaceWrite";
//! Write to a cuArray (texture data source) using surface writes
//! @param gIData input data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void surfaceWriteKernel(float *gIData, int width, int height, cudaSurfaceObject_t outputSurface)
{
    // calculate surface coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // read from global memory and write to cuarray (via surface reference)
    surf2Dwrite(gIData[y * width + x], outputSurface, x * 4, y, cudaBoundaryModeTrap);
}

////////////////////////////////////////////////////////////////////////////////
//! Transform an image using texture lookups
//! @param gOData output data in global memory
////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *gOData, int width, int height, float theta, cudaTextureObject_t tex)
{
    // calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    float u = x / (float)width;
    float v = y / (float)height;

    // transform coordinates
    u -= 0.5f;
    v -= 0.5f;
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;

    // read from texture and write to global memory
    gOData[y * width + x] = tex2D<float>(tex, tu, tv);
}

////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, char **argv);

extern "C" void computeGold(float *reference, float *idata, const unsigned int len);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    printf("%s starting...\n", sampleName);

    // Process command-line arguments
    if (argc > 1) {
        if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
            getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename);

            if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
                getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename);
            }
            else {
                printf("-input flag should be used with -reference flag");
                exit(EXIT_FAILURE);
            }
        }
        else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
            printf("-reference flag should be used with -input flag");
            exit(EXIT_FAILURE);
        }
    }

    runTest(argc, argv);

    printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
    exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    // Use command-line specified CUDA device,
    // otherwise use device with highest Gflops/s
    int devID = findCudaDevice(argc, (const char **)argv);

    // Get number of SMs on this GPU
    cudaDeviceProp deviceProps;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
    printf("CUDA device [%s] has %d Multi-Processors, SM %d.%d\n",
           deviceProps.name,
           deviceProps.multiProcessorCount,
           deviceProps.major,
           deviceProps.minor);

    // Load image from disk
    float       *hData = NULL;
    unsigned int width, height;
    char        *imagePath = sdkFindFilePath(imageFilename, argv[0]);

    if (imagePath == NULL) {
        printf("Unable to source image input file: %s\n", imageFilename);
        exit(EXIT_FAILURE);
    }

    sdkLoadPGM(imagePath, &hData, &width, &height);

    unsigned int size = width * height * sizeof(float);
    printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);

    // Load reference image from image (output)
    float *hDataRef = (float *)malloc(size);
    char  *refPath  = sdkFindFilePath(refFilename, argv[0]);

    if (refPath == NULL) {
        printf("Unable to find reference image file: %s\n", refFilename);
        exit(EXIT_FAILURE);
    }

    sdkLoadPGM(refPath, &hDataRef, &width, &height);

    // Allocate device memory for result
    float *dData = NULL;
    checkCudaErrors(cudaMalloc((void **)&dData, size));

    // Allocate array and copy image data
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaArray            *cuArray;
    checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height, cudaArraySurfaceLoadStore));

    dim3 dimBlock(8, 8, 1);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

    cudaSurfaceObject_t outputSurface;
    cudaResourceDesc    surfRes;
    memset(&surfRes, 0, sizeof(cudaResourceDesc));
    surfRes.resType         = cudaResourceTypeArray;
    surfRes.res.array.array = cuArray;
    checkCudaErrors(cudaCreateSurfaceObject(&outputSurface, &surfRes));

#if 1
    checkCudaErrors(cudaMemcpy(dData, hData, size, cudaMemcpyHostToDevice));
    surfaceWriteKernel<<<dimGrid, dimBlock>>>(dData, width, height, outputSurface);
#else // This is what differs from the example simpleTexture
    checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
#endif

    cudaTextureObject_t tex;
cudaResourceDesc texRes; cudaResourceDesc texRes;
memset(&texRes, 0, sizeof(cudaResourceDesc)); memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray; texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = cuArray; texRes.res.array.array = cuArray;
cudaTextureDesc texDescr; cudaTextureDesc texDescr;
memset(&texDescr, 0, sizeof(cudaTextureDesc)); memset(&texDescr, 0, sizeof(cudaTextureDesc));
texDescr.normalizedCoords = true; texDescr.normalizedCoords = true;
texDescr.filterMode = cudaFilterModeLinear; texDescr.filterMode = cudaFilterModeLinear;
texDescr.addressMode[0] = cudaAddressModeWrap; texDescr.addressMode[0] = cudaAddressModeWrap;
texDescr.addressMode[1] = cudaAddressModeWrap; texDescr.addressMode[1] = cudaAddressModeWrap;
texDescr.readMode = cudaReadModeElementType; texDescr.readMode = cudaReadModeElementType;
checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
// Warmup // Warmup
transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex); transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
StopWatchInterface *timer = NULL; StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer); sdkCreateTimer(&timer);
sdkStartTimer(&timer); sdkStartTimer(&timer);
// Execute the kernel // Execute the kernel
transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex); transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
// Check if kernel execution generated an error // Check if kernel execution generated an error
getLastCudaError("Kernel execution failed"); getLastCudaError("Kernel execution failed");
cudaDeviceSynchronize(); cudaDeviceSynchronize();
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
printf("%.2f Mpixels/sec\n", printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
(width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); sdkDeleteTimer(&timer);
sdkDeleteTimer(&timer);
// Allocate mem for the result on host side // Allocate mem for the result on host side
float *hOData = (float *)malloc(size); float *hOData = (float *)malloc(size);
// copy result from device to host // copy result from device to host
checkCudaErrors(cudaMemcpy(hOData, dData, size, cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(hOData, dData, size, cudaMemcpyDeviceToHost));
// Write result to file // Write result to file
char outputFilename[1024]; char outputFilename[1024];
strcpy(outputFilename, "output.pgm"); strcpy(outputFilename, "output.pgm");
sdkSavePGM("output.pgm", hOData, width, height); sdkSavePGM("output.pgm", hOData, width, height);
printf("Wrote '%s'\n", outputFilename); printf("Wrote '%s'\n", outputFilename);
// Write regression file if necessary // Write regression file if necessary
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// Write file for regression test // Write file for regression test
sdkWriteFile<float>("./data/regression.dat", hOData, width * height, 0.0f, sdkWriteFile<float>("./data/regression.dat", hOData, width * height, 0.0f, false);
false); }
} else { else {
// We need to reload the data from disk, // We need to reload the data from disk,
// because it is inverted upon output // because it is inverted upon output
sdkLoadPGM(outputFilename, &hOData, &width, &height); sdkLoadPGM(outputFilename, &hOData, &width, &height);
printf("Comparing files\n"); printf("Comparing files\n");
printf("\toutput: <%s>\n", outputFilename); printf("\toutput: <%s>\n", outputFilename);
printf("\treference: <%s>\n", refPath); printf("\treference: <%s>\n", refPath);
testResult = testResult = compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f);
compareData(hOData, hDataRef, width * height, MIN_EPSILON_ERROR, 0.0f); }
}
checkCudaErrors(cudaDestroySurfaceObject(outputSurface)); checkCudaErrors(cudaDestroySurfaceObject(outputSurface));
checkCudaErrors(cudaDestroyTextureObject(tex)); checkCudaErrors(cudaDestroyTextureObject(tex));
checkCudaErrors(cudaFree(dData)); checkCudaErrors(cudaFree(dData));
checkCudaErrors(cudaFreeArray(cuArray)); checkCudaErrors(cudaFreeArray(cuArray));
free(imagePath); free(imagePath);
free(refPath); free(refPath);
} }
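The body of surfaceWriteKernel is not part of this hunk; for orientation, a minimal sketch of a kernel of that shape (hypothetical name and body, assuming the same parameter order used at the launch site above and the surface created with cudaArraySurfaceLoadStore) would be:

// Hypothetical sketch, not part of this commit: copy a float buffer into the
// surface object. Note that surf2Dwrite takes the x coordinate in bytes.
__global__ void surfaceWriteSketch(float *gIData, int width, int height, cudaSurfaceObject_t outputSurface)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < (unsigned int)width && y < (unsigned int)height) {
        surf2Dwrite(gIData[y * width + x], outputSurface, x * sizeof(float), y);
    }
}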


@ -68,106 +68,118 @@
// this // this
// struct by putting an undefined symbol in the function body so it won't // struct by putting an undefined symbol in the function body so it won't
// compile. // compile.
template <typename T> template <typename T> struct SharedMemory
struct SharedMemory { {
// Ensure that we won't compile any un-specialized types // Ensure that we won't compile any un-specialized types
__device__ T *getPointer() { __device__ T *getPointer()
extern __device__ void error(void); {
error(); extern __device__ void error(void);
return NULL; error();
} return NULL;
}
}; };
// Following are the specializations for the following types. // Following are the specializations for the following types.
// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double // int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double
// One could also specialize it for user-defined types. // One could also specialize it for user-defined types.
template <> template <> struct SharedMemory<int>
struct SharedMemory<int> { {
__device__ int *getPointer() { __device__ int *getPointer()
extern __shared__ int s_int[]; {
return s_int; extern __shared__ int s_int[];
} return s_int;
}
}; };
template <> template <> struct SharedMemory<unsigned int>
struct SharedMemory<unsigned int> { {
__device__ unsigned int *getPointer() { __device__ unsigned int *getPointer()
extern __shared__ unsigned int s_uint[]; {
return s_uint; extern __shared__ unsigned int s_uint[];
} return s_uint;
}
}; };
template <> template <> struct SharedMemory<char>
struct SharedMemory<char> { {
__device__ char *getPointer() { __device__ char *getPointer()
extern __shared__ char s_char[]; {
return s_char; extern __shared__ char s_char[];
} return s_char;
}
}; };
template <> template <> struct SharedMemory<unsigned char>
struct SharedMemory<unsigned char> { {
__device__ unsigned char *getPointer() { __device__ unsigned char *getPointer()
extern __shared__ unsigned char s_uchar[]; {
return s_uchar; extern __shared__ unsigned char s_uchar[];
} return s_uchar;
}
}; };
template <> template <> struct SharedMemory<short>
struct SharedMemory<short> { {
__device__ short *getPointer() { __device__ short *getPointer()
extern __shared__ short s_short[]; {
return s_short; extern __shared__ short s_short[];
} return s_short;
}
}; };
template <> template <> struct SharedMemory<unsigned short>
struct SharedMemory<unsigned short> { {
__device__ unsigned short *getPointer() { __device__ unsigned short *getPointer()
extern __shared__ unsigned short s_ushort[]; {
return s_ushort; extern __shared__ unsigned short s_ushort[];
} return s_ushort;
}
}; };
template <> template <> struct SharedMemory<long>
struct SharedMemory<long> { {
__device__ long *getPointer() { __device__ long *getPointer()
extern __shared__ long s_long[]; {
return s_long; extern __shared__ long s_long[];
} return s_long;
}
}; };
template <> template <> struct SharedMemory<unsigned long>
struct SharedMemory<unsigned long> { {
__device__ unsigned long *getPointer() { __device__ unsigned long *getPointer()
extern __shared__ unsigned long s_ulong[]; {
return s_ulong; extern __shared__ unsigned long s_ulong[];
} return s_ulong;
}
}; };
template <> template <> struct SharedMemory<bool>
struct SharedMemory<bool> { {
__device__ bool *getPointer() { __device__ bool *getPointer()
extern __shared__ bool s_bool[]; {
return s_bool; extern __shared__ bool s_bool[];
} return s_bool;
}
}; };
template <> template <> struct SharedMemory<float>
struct SharedMemory<float> { {
__device__ float *getPointer() { __device__ float *getPointer()
extern __shared__ float s_float[]; {
return s_float; extern __shared__ float s_float[];
} return s_float;
}
}; };
template <> template <> struct SharedMemory<double>
struct SharedMemory<double> { {
__device__ double *getPointer() { __device__ double *getPointer()
extern __shared__ double s_double[]; {
return s_double; extern __shared__ double s_double[];
} return s_double;
}
}; };
#endif //_SHAREDMEM_H_ #endif //_SHAREDMEM_H_
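As the comment in this header notes, the same pattern extends to user-defined types. A minimal sketch (not part of this file), assuming a hypothetical POD struct named Particle:

struct Particle
{
    float x, y, z, w;
};

// Specialization declaring the extern shared array under a unique name,
// exactly as the built-in specializations above do.
template <> struct SharedMemory<Particle>
{
    __device__ Particle *getPointer()
    {
        extern __shared__ Particle s_particle[];
        return s_particle;
    }
};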


@ -26,23 +26,23 @@
*/ */
/* This sample is a templatized version of the template project. /* This sample is a templatized version of the template project.
* It also shows how to correctly templatize dynamically allocated shared * It also shows how to correctly templatize dynamically allocated shared
* memory arrays. * memory arrays.
* Host code. * Host code.
*/ */
// System includes // System includes
#include <stdio.h>
#include <assert.h> #include <assert.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <string.h>
// CUDA runtime // CUDA runtime
#include <cuda_runtime.h> #include <cuda_runtime.h>
// helper functions and utilities to work with CUDA // helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h>
#ifndef MAX #ifndef MAX
#define MAX(a, b) (a > b ? a : b) #define MAX(a, b) (a > b ? a : b)
@ -58,55 +58,55 @@ int g_TotalFailures = 0;
//! @param g_idata input data in global memory //! @param g_idata input data in global memory
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class T> template <class T> __global__ void testKernel(T *g_idata, T *g_odata)
__global__ void testKernel(T *g_idata, T *g_odata) { {
// Shared mem size is determined by the host app at run time // Shared mem size is determined by the host app at run time
SharedMemory<T> smem; SharedMemory<T> smem;
T *sdata = smem.getPointer(); T *sdata = smem.getPointer();
// access thread id // access thread id
const unsigned int tid = threadIdx.x; const unsigned int tid = threadIdx.x;
// access number of threads in this block // access number of threads in this block
const unsigned int num_threads = blockDim.x; const unsigned int num_threads = blockDim.x;
// read in input data from global memory // read in input data from global memory
sdata[tid] = g_idata[tid]; sdata[tid] = g_idata[tid];
__syncthreads(); __syncthreads();
// perform some computations // perform some computations
sdata[tid] = (T)num_threads * sdata[tid]; sdata[tid] = (T)num_threads * sdata[tid];
__syncthreads(); __syncthreads();
// write data to global memory // write data to global memory
g_odata[tid] = sdata[tid]; g_odata[tid] = sdata[tid];
} }
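The SharedMemory<T> indirection used above exists because a templated kernel cannot simply declare the dynamic shared array per type: once two instantiations exist, each one declares the same extern array symbol with a different type and the compiler rejects the duplicate, conflicting definitions. Illustrative only (this is the pattern the wrapper avoids, not code from the sample):

template <class T> __global__ void naiveKernel(T *g_idata, T *g_odata)
{
    // Fails once naiveKernel<int> and naiveKernel<float> are both instantiated:
    // both declare the extern symbol "sdata" with incompatible element types.
    extern __shared__ T sdata[];

    const unsigned int tid = threadIdx.x;
    sdata[tid]             = g_idata[tid];
    __syncthreads();
    g_odata[tid] = sdata[tid];
}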
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// declaration, forward // declaration, forward
template <class T> template <class T> void runTest(int argc, char **argv, int len);
void runTest(int argc, char **argv, int len);
template <class T> template <class T> void computeGold(T *reference, T *idata, const unsigned int len)
void computeGold(T *reference, T *idata, const unsigned int len) { {
const T T_len = static_cast<T>(len); const T T_len = static_cast<T>(len);
for (unsigned int i = 0; i < len; ++i) { for (unsigned int i = 0; i < len; ++i) {
reference[i] = idata[i] * T_len; reference[i] = idata[i] * T_len;
} }
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
printf("> runTest<float,32>\n"); {
runTest<float>(argc, argv, 32); printf("> runTest<float,32>\n");
printf("> runTest<int,64>\n"); runTest<float>(argc, argv, 32);
runTest<int>(argc, argv, 64); printf("> runTest<int,64>\n");
runTest<int>(argc, argv, 64);
printf("\n[simpleTemplates] -> Test Results: %d Failures\n", g_TotalFailures); printf("\n[simpleTemplates] -> Test Results: %d Failures\n", g_TotalFailures);
exit(g_TotalFailures == 0 ? EXIT_SUCCESS : EXIT_FAILURE); exit(g_TotalFailures == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
} }
// To completely templatize runTest (below) with cutil, we need to use // To completely templatize runTest (below) with cutil, we need to use
@ -114,151 +114,152 @@ int main(int argc, char **argv) {
// functions for different types. // functions for different types.
// Here's the generic wrapper for cutCompare* // Here's the generic wrapper for cutCompare*
template <class T> template <class T> class ArrayComparator
class ArrayComparator { {
public: public:
bool compare(const T *reference, T *data, unsigned int len) { bool compare(const T *reference, T *data, unsigned int len)
fprintf(stderr, {
"Error: no comparison function implemented for this type\n"); fprintf(stderr, "Error: no comparison function implemented for this type\n");
return false; return false;
} }
}; };
// Here's the specialization for ints: // Here's the specialization for ints:
template <> template <> class ArrayComparator<int>
class ArrayComparator<int> { {
public: public:
bool compare(const int *reference, int *data, unsigned int len) { bool compare(const int *reference, int *data, unsigned int len)
return compareData(reference, data, len, 0.15f, 0.0f); {
} return compareData(reference, data, len, 0.15f, 0.0f);
}
}; };
// Here's the specialization for floats: // Here's the specialization for floats:
template <> template <> class ArrayComparator<float>
class ArrayComparator<float> { {
public: public:
bool compare(const float *reference, float *data, unsigned int len) { bool compare(const float *reference, float *data, unsigned int len)
return compareData(reference, data, len, 0.15f, 0.15f); {
} return compareData(reference, data, len, 0.15f, 0.15f);
}
}; };
// Here's the generic wrapper for cutWriteFile* // Here's the generic wrapper for cutWriteFile*
template <class T> template <class T> class ArrayFileWriter
class ArrayFileWriter { {
public: public:
bool write(const char *filename, T *data, unsigned int len, float epsilon) { bool write(const char *filename, T *data, unsigned int len, float epsilon)
fprintf(stderr, {
"Error: no file write function implemented for this type\n"); fprintf(stderr, "Error: no file write function implemented for this type\n");
return false; return false;
} }
}; };
// Here's the specialization for ints: // Here's the specialization for ints:
template <> template <> class ArrayFileWriter<int>
class ArrayFileWriter<int> { {
public: public:
bool write(const char *filename, int *data, unsigned int len, float epsilon) { bool write(const char *filename, int *data, unsigned int len, float epsilon)
return sdkWriteFile(filename, data, len, epsilon, false); {
} return sdkWriteFile(filename, data, len, epsilon, false);
}
}; };
// Here's the specialization for floats: // Here's the specialization for floats:
template <> template <> class ArrayFileWriter<float>
class ArrayFileWriter<float> { {
public: public:
bool write(const char *filename, float *data, unsigned int len, bool write(const char *filename, float *data, unsigned int len, float epsilon)
float epsilon) { {
return sdkWriteFile(filename, data, len, epsilon, false); return sdkWriteFile(filename, data, len, epsilon, false);
} }
}; };
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
template <class T> template <class T> void runTest(int argc, char **argv, int len)
void runTest(int argc, char **argv, int len) { {
int devID; int devID;
cudaDeviceProp deviceProps; cudaDeviceProp deviceProps;
devID = findCudaDevice(argc, (const char **)argv); devID = findCudaDevice(argc, (const char **)argv);
// get number of SMs on this GPU // get number of SMs on this GPU
checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID)); checkCudaErrors(cudaGetDeviceProperties(&deviceProps, devID));
printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, printf("CUDA device [%s] has %d Multi-Processors\n", deviceProps.name, deviceProps.multiProcessorCount);
deviceProps.multiProcessorCount);
// create and start timer // create and start timer
StopWatchInterface *timer = NULL; StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer); sdkCreateTimer(&timer);
// start the timer // start the timer
sdkStartTimer(&timer); sdkStartTimer(&timer);
unsigned int num_threads = len; unsigned int num_threads = len;
unsigned int mem_size = sizeof(float) * num_threads; unsigned int mem_size = sizeof(float) * num_threads;
// allocate host memory // allocate host memory
T *h_idata = (T *)malloc(mem_size); T *h_idata = (T *)malloc(mem_size);
// initialize the memory // initialize the memory
for (unsigned int i = 0; i < num_threads; ++i) { for (unsigned int i = 0; i < num_threads; ++i) {
h_idata[i] = (T)i; h_idata[i] = (T)i;
} }
// allocate device memory // allocate device memory
T *d_idata; T *d_idata;
checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size)); checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
// copy host memory to device // copy host memory to device
checkCudaErrors( checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
// allocate device memory for result // allocate device memory for result
T *d_odata; T *d_odata;
checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size)); checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size));
// setup execution parameters // setup execution parameters
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
dim3 threads(num_threads, 1, 1); dim3 threads(num_threads, 1, 1);
// execute the kernel // execute the kernel
testKernel<T><<<grid, threads, mem_size>>>(d_idata, d_odata); testKernel<T><<<grid, threads, mem_size>>>(d_idata, d_odata);
// check if kernel execution generated an error // check if kernel execution generated an error
getLastCudaError("Kernel execution failed"); getLastCudaError("Kernel execution failed");
// allocate mem for the result on host side // allocate mem for the result on host side
T *h_odata = (T *)malloc(mem_size); T *h_odata = (T *)malloc(mem_size);
// copy result from device to host // copy result from device to host
checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads, checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(T) * num_threads, cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost));
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
sdkDeleteTimer(&timer); sdkDeleteTimer(&timer);
// compute reference solution // compute reference solution
T *reference = (T *)malloc(mem_size); T *reference = (T *)malloc(mem_size);
computeGold<T>(reference, h_idata, num_threads); computeGold<T>(reference, h_idata, num_threads);
ArrayComparator<T> comparator; ArrayComparator<T> comparator;
ArrayFileWriter<T> writer; ArrayFileWriter<T> writer;
// check result // check result
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// write file for regression test // write file for regression test
writer.write("./data/regression.dat", h_odata, num_threads, 0.0f); writer.write("./data/regression.dat", h_odata, num_threads, 0.0f);
} else { }
// custom output handling when no regression test running else {
// in this case check if the result is equivalent to the expected solution // custom output handling when no regression test running
bool res = comparator.compare(reference, h_odata, num_threads); // in this case check if the result is equivalent to the expected solution
printf("Compare %s\n\n", (1 == res) ? "OK" : "MISMATCH"); bool res = comparator.compare(reference, h_odata, num_threads);
g_TotalFailures += (1 != res); printf("Compare %s\n\n", (1 == res) ? "OK" : "MISMATCH");
} g_TotalFailures += (1 != res);
}
// cleanup memory // cleanup memory
free(h_idata); free(h_idata);
free(h_odata); free(h_odata);
free(reference); free(reference);
checkCudaErrors(cudaFree(d_idata)); checkCudaErrors(cudaFree(d_idata));
checkCudaErrors(cudaFree(d_odata)); checkCudaErrors(cudaFree(d_odata));
} }
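One detail worth noting while reading this hunk: mem_size is computed from sizeof(float) even though the buffers hold T. That is harmless for the two instantiations used here (int and float are both 4 bytes on the supported platforms), but a wider type such as double would under-allocate. A type-safe variant, shown only as a sketch, would be:

unsigned int mem_size = sizeof(T) * num_threads; // sized for the actual element type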


@ -34,10 +34,10 @@
*/ */
// Includes, system // Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32 #ifdef _WIN32
#define WINDOWS_LEAN_AND_MEAN #define WINDOWS_LEAN_AND_MEAN
@ -49,22 +49,22 @@
#include <cuda_runtime.h> #include <cuda_runtime.h>
// Utilities and timing functions // Utilities and timing functions
#include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h #include <helper_functions.h> // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions // CUDA helper functions
#include <helper_cuda.h> // helper functions for CUDA error check #include <helper_cuda.h> // helper functions for CUDA error check
#define MAX_EPSILON_ERROR 5e-3f #define MAX_EPSILON_ERROR 5e-3f
// Define the files that are to be saved and the reference images for validation // Define the files that are to be saved and the reference images for validation
const char *imageFilename = "teapot512.pgm"; const char *imageFilename = "teapot512.pgm";
const char *refFilename = "ref_rotated.pgm"; const char *refFilename = "ref_rotated.pgm";
const char *sampleName = "simpleTexture"; const char *sampleName = "simpleTexture";
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Constants // Constants
const float angle = 0.5f; // angle to rotate image by (in radians) const float angle = 0.5f; // angle to rotate image by (in radians)
// Auto-Verification Code // Auto-Verification Code
bool testResult = true; bool testResult = true;
@ -73,22 +73,22 @@ bool testResult = true;
//! Transform an image using texture lookups //! Transform an image using texture lookups
//! @param outputData output data in global memory //! @param outputData output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void transformKernel(float *outputData, int width, int height, __global__ void transformKernel(float *outputData, int width, int height, float theta, cudaTextureObject_t tex)
float theta, cudaTextureObject_t tex) { {
// calculate normalized texture coordinates // calculate normalized texture coordinates
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
float u = (float)x - (float)width / 2; float u = (float)x - (float)width / 2;
float v = (float)y - (float)height / 2; float v = (float)y - (float)height / 2;
float tu = u * cosf(theta) - v * sinf(theta); float tu = u * cosf(theta) - v * sinf(theta);
float tv = v * cosf(theta) + u * sinf(theta); float tv = v * cosf(theta) + u * sinf(theta);
tu /= (float)width; tu /= (float)width;
tv /= (float)height; tv /= (float)height;
// read from texture and write to global memory // read from texture and write to global memory
outputData[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f); outputData[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f);
} }
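In other words, each output pixel (x, y) samples the input at the position obtained by rotating its centred coordinates by theta and renormalising to texture space (the +0.5 re-centring is applied at the tex2D call). Written out, the kernel computes

\[
t_u = \frac{u\cos\theta - v\sin\theta}{W} + \tfrac{1}{2}, \qquad
t_v = \frac{u\sin\theta + v\cos\theta}{H} + \tfrac{1}{2}, \qquad
u = x - \tfrac{W}{2}, \; v = y - \tfrac{H}{2}.
\]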
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -98,154 +98,151 @@ void runTest(int argc, char **argv);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
printf("%s starting...\n", sampleName); {
printf("%s starting...\n", sampleName);
// Process command-line arguments // Process command-line arguments
if (argc > 1) { if (argc > 1) {
if (checkCmdLineFlag(argc, (const char **)argv, "input")) { if (checkCmdLineFlag(argc, (const char **)argv, "input")) {
getCmdLineArgumentString(argc, (const char **)argv, "input", getCmdLineArgumentString(argc, (const char **)argv, "input", (char **)&imageFilename);
(char **)&imageFilename);
if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
getCmdLineArgumentString(argc, (const char **)argv, "reference", getCmdLineArgumentString(argc, (const char **)argv, "reference", (char **)&refFilename);
(char **)&refFilename); }
} else { else {
printf("-input flag should be used with -reference flag"); printf("-input flag should be used with -reference flag");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
} else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) { }
printf("-reference flag should be used with -input flag"); else if (checkCmdLineFlag(argc, (const char **)argv, "reference")) {
exit(EXIT_FAILURE); printf("-reference flag should be used with -input flag");
exit(EXIT_FAILURE);
}
} }
}
runTest(argc, argv); runTest(argc, argv);
printf("%s completed, returned %s\n", sampleName, printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!");
testResult ? "OK" : "ERROR!"); exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
int devID = findCudaDevice(argc, (const char **)argv); {
int devID = findCudaDevice(argc, (const char **)argv);
// load image from disk // load image from disk
float *hData = NULL; float *hData = NULL;
unsigned int width, height; unsigned int width, height;
char *imagePath = sdkFindFilePath(imageFilename, argv[0]); char *imagePath = sdkFindFilePath(imageFilename, argv[0]);
if (imagePath == NULL) { if (imagePath == NULL) {
printf("Unable to source image file: %s\n", imageFilename); printf("Unable to source image file: %s\n", imageFilename);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
sdkLoadPGM(imagePath, &hData, &width, &height); sdkLoadPGM(imagePath, &hData, &width, &height);
unsigned int size = width * height * sizeof(float); unsigned int size = width * height * sizeof(float);
printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height); printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);
// Load reference image from image (output) // Load reference image from image (output)
float *hDataRef = (float *)malloc(size); float *hDataRef = (float *)malloc(size);
char *refPath = sdkFindFilePath(refFilename, argv[0]); char *refPath = sdkFindFilePath(refFilename, argv[0]);
if (refPath == NULL) { if (refPath == NULL) {
printf("Unable to find reference image file: %s\n", refFilename); printf("Unable to find reference image file: %s\n", refFilename);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
sdkLoadPGM(refPath, &hDataRef, &width, &height); sdkLoadPGM(refPath, &hDataRef, &width, &height);
// Allocate device memory for result // Allocate device memory for result
float *dData = NULL; float *dData = NULL;
checkCudaErrors(cudaMalloc((void **)&dData, size)); checkCudaErrors(cudaMalloc((void **)&dData, size));
// Allocate array and copy image data // Allocate array and copy image data
cudaChannelFormatDesc channelDesc = cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); cudaArray *cuArray;
cudaArray *cuArray; checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height));
checkCudaErrors(cudaMallocArray(&cuArray, &channelDesc, width, height)); checkCudaErrors(cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
checkCudaErrors(
cudaMemcpyToArray(cuArray, 0, 0, hData, size, cudaMemcpyHostToDevice));
cudaTextureObject_t tex; cudaTextureObject_t tex;
cudaResourceDesc texRes; cudaResourceDesc texRes;
memset(&texRes, 0, sizeof(cudaResourceDesc)); memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray; texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = cuArray; texRes.res.array.array = cuArray;
cudaTextureDesc texDescr; cudaTextureDesc texDescr;
memset(&texDescr, 0, sizeof(cudaTextureDesc)); memset(&texDescr, 0, sizeof(cudaTextureDesc));
texDescr.normalizedCoords = true; texDescr.normalizedCoords = true;
texDescr.filterMode = cudaFilterModeLinear; texDescr.filterMode = cudaFilterModeLinear;
texDescr.addressMode[0] = cudaAddressModeWrap; texDescr.addressMode[0] = cudaAddressModeWrap;
texDescr.addressMode[1] = cudaAddressModeWrap; texDescr.addressMode[1] = cudaAddressModeWrap;
texDescr.readMode = cudaReadModeElementType; texDescr.readMode = cudaReadModeElementType;
checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
dim3 dimBlock(8, 8, 1); dim3 dimBlock(8, 8, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1); dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
// Warmup // Warmup
transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex); transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
StopWatchInterface *timer = NULL; StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer); sdkCreateTimer(&timer);
sdkStartTimer(&timer); sdkStartTimer(&timer);
// Execute the kernel // Execute the kernel
transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex); transformKernel<<<dimGrid, dimBlock, 0>>>(dData, width, height, angle, tex);
// Check if kernel execution generated an error // Check if kernel execution generated an error
getLastCudaError("Kernel execution failed"); getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
printf("%.2f Mpixels/sec\n", printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
(width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); sdkDeleteTimer(&timer);
sdkDeleteTimer(&timer);
// Allocate mem for the result on host side // Allocate mem for the result on host side
float *hOutputData = (float *)malloc(size); float *hOutputData = (float *)malloc(size);
// copy result from device to host // copy result from device to host
checkCudaErrors(cudaMemcpy(hOutputData, dData, size, cudaMemcpyDeviceToHost)); checkCudaErrors(cudaMemcpy(hOutputData, dData, size, cudaMemcpyDeviceToHost));
// Write result to file // Write result to file
char outputFilename[1024]; char outputFilename[1024];
strcpy(outputFilename, imagePath); strcpy(outputFilename, imagePath);
strcpy(outputFilename + strlen(imagePath) - 4, "_out.pgm"); strcpy(outputFilename + strlen(imagePath) - 4, "_out.pgm");
sdkSavePGM(outputFilename, hOutputData, width, height); sdkSavePGM(outputFilename, hOutputData, width, height);
printf("Wrote '%s'\n", outputFilename); printf("Wrote '%s'\n", outputFilename);
// Write regression file if necessary // Write regression file if necessary
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// Write file for regression test // Write file for regression test
sdkWriteFile<float>("./data/regression.dat", hOutputData, width * height, sdkWriteFile<float>("./data/regression.dat", hOutputData, width * height, 0.0f, false);
0.0f, false); }
} else { else {
// We need to reload the data from disk, // We need to reload the data from disk,
// because it is inverted upon output // because it is inverted upon output
sdkLoadPGM(outputFilename, &hOutputData, &width, &height); sdkLoadPGM(outputFilename, &hOutputData, &width, &height);
printf("Comparing files\n"); printf("Comparing files\n");
printf("\toutput: <%s>\n", outputFilename); printf("\toutput: <%s>\n", outputFilename);
printf("\treference: <%s>\n", refPath); printf("\treference: <%s>\n", refPath);
testResult = compareData(hOutputData, hDataRef, width * height, testResult = compareData(hOutputData, hDataRef, width * height, MAX_EPSILON_ERROR, 0.15f);
MAX_EPSILON_ERROR, 0.15f); }
}
checkCudaErrors(cudaDestroyTextureObject(tex)); checkCudaErrors(cudaDestroyTextureObject(tex));
checkCudaErrors(cudaFree(dData)); checkCudaErrors(cudaFree(dData));
checkCudaErrors(cudaFreeArray(cuArray)); checkCudaErrors(cudaFreeArray(cuArray));
free(imagePath); free(imagePath);
free(refPath); free(refPath);
} }
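Not a change made by this formatting commit, but worth flagging while in this file: cudaMemcpyToArray has been deprecated in recent CUDA releases. A sketch of the equivalent copy with the non-deprecated API, assuming the same tightly packed host buffer (so the source pitch equals the row width in bytes):

checkCudaErrors(cudaMemcpy2DToArray(
    cuArray, 0, 0, hData, width * sizeof(float), width * sizeof(float), height, cudaMemcpyHostToDevice));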


@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details) ## References (for more details)


@ -32,11 +32,11 @@
using 3D texture lookups. using 3D texture lookups.
*/ */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <helper_gl.h> #include <helper_gl.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if defined(__APPLE__) || defined(MACOSX) #if defined(__APPLE__) || defined(MACOSX)
#pragma clang diagnostic ignored "-Wdeprecated-declarations" #pragma clang diagnostic ignored "-Wdeprecated-declarations"
@ -49,53 +49,52 @@
#endif #endif
// includes, cuda // includes, cuda
#include <vector_types.h>
#include <cuda_runtime.h>
#include <cuda_gl_interop.h> #include <cuda_gl_interop.h>
#include <cuda_runtime.h>
#include <vector_types.h>
// CUDA utilities and system includes // CUDA utilities and system includes
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_functions.h> #include <helper_functions.h>
#include <vector_types.h> #include <vector_types.h>
typedef unsigned int uint; typedef unsigned int uint;
typedef unsigned char uchar; typedef unsigned char uchar;
#define MAX_EPSILON_ERROR 5.0f #define MAX_EPSILON_ERROR 5.0f
#define THRESHOLD 0.15f #define THRESHOLD 0.15f
const char *sSDKsample = "simpleTexture3D"; const char *sSDKsample = "simpleTexture3D";
const char *volumeFilename = "Bucky.raw"; const char *volumeFilename = "Bucky.raw";
const cudaExtent volumeSize = make_cudaExtent(32, 32, 32); const cudaExtent volumeSize = make_cudaExtent(32, 32, 32);
const uint width = 512, height = 512; const uint width = 512, height = 512;
const dim3 blockSize(16, 16, 1); const dim3 blockSize(16, 16, 1);
const dim3 gridSize(width / blockSize.x, height / blockSize.y); const dim3 gridSize(width / blockSize.x, height / blockSize.y);
float w = 0.5; // texture coordinate in z float w = 0.5; // texture coordinate in z
GLuint pbo; // OpenGL pixel buffer object GLuint pbo; // OpenGL pixel buffer object
struct cudaGraphicsResource struct cudaGraphicsResource *cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO)
*cuda_pbo_resource; // CUDA Graphics Resource (to transfer PBO)
bool linearFiltering = true; bool linearFiltering = true;
bool animate = true; bool animate = true;
StopWatchInterface *timer = NULL; StopWatchInterface *timer = NULL;
uint *d_output = NULL; uint *d_output = NULL;
// Auto-Verification Code // Auto-Verification Code
const int frameCheckNumber = 4; const int frameCheckNumber = 4;
int fpsCount = 0; // FPS count for averaging int fpsCount = 0; // FPS count for averaging
int fpsLimit = 1; // FPS limit for sampling int fpsLimit = 1; // FPS limit for sampling
int g_Index = 0; int g_Index = 0;
unsigned int frameCount = 0; unsigned int frameCount = 0;
unsigned int g_TotalErrors = 0; unsigned int g_TotalErrors = 0;
volatile int g_GraphicsMapFlag = 0; volatile int g_GraphicsMapFlag = 0;
int *pArgc = NULL; int *pArgc = NULL;
char **pArgv = NULL; char **pArgv = NULL;
#ifndef MAX #ifndef MAX
@ -105,288 +104,294 @@ char **pArgv = NULL;
extern "C" void cleanup(); extern "C" void cleanup();
extern "C" void setTextureFilterMode(bool bLinearFilter); extern "C" void setTextureFilterMode(bool bLinearFilter);
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize); extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize);
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w);
uint imageW, uint imageH, float w); extern void cleanupCuda();
extern void cleanupCuda();
void loadVolumeData(char *exec_path); void loadVolumeData(char *exec_path);
void computeFPS() { void computeFPS()
frameCount++; {
fpsCount++; frameCount++;
fpsCount++;
if (fpsCount == fpsLimit) { if (fpsCount == fpsLimit) {
char fps[256]; char fps[256];
float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f);
sprintf(fps, "%s: %3.1f fps", sSDKsample, ifps); sprintf(fps, "%s: %3.1f fps", sSDKsample, ifps);
glutSetWindowTitle(fps); glutSetWindowTitle(fps);
fpsCount = 0; fpsCount = 0;
fpsLimit = ftoi(MAX(1.0f, ifps)); fpsLimit = ftoi(MAX(1.0f, ifps));
sdkResetTimer(&timer); sdkResetTimer(&timer);
} }
} }
// render image using CUDA // render image using CUDA
void render() { void render()
// map PBO to get CUDA device pointer {
g_GraphicsMapFlag++; // map PBO to get CUDA device pointer
checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0)); g_GraphicsMapFlag++;
size_t num_bytes; checkCudaErrors(cudaGraphicsMapResources(1, &cuda_pbo_resource, 0));
checkCudaErrors(cudaGraphicsResourceGetMappedPointer( size_t num_bytes;
(void **)&d_output, &num_bytes, cuda_pbo_resource)); checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_output, &num_bytes, cuda_pbo_resource));
// printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes); // printf("CUDA mapped PBO: May access %ld bytes\n", num_bytes);
// call CUDA kernel, writing results to PBO // call CUDA kernel, writing results to PBO
render_kernel(gridSize, blockSize, d_output, width, height, w); render_kernel(gridSize, blockSize, d_output, width, height, w);
getLastCudaError("render_kernel failed"); getLastCudaError("render_kernel failed");
if (g_GraphicsMapFlag) { if (g_GraphicsMapFlag) {
checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
g_GraphicsMapFlag--; g_GraphicsMapFlag--;
} }
} }
// display results using OpenGL (called by GLUT) // display results using OpenGL (called by GLUT)
void display() { void display()
sdkStartTimer(&timer); {
sdkStartTimer(&timer);
render(); render();
// display results // display results
glClear(GL_COLOR_BUFFER_BIT); glClear(GL_COLOR_BUFFER_BIT);
// draw image from PBO // draw image from PBO
glDisable(GL_DEPTH_TEST); glDisable(GL_DEPTH_TEST);
glRasterPos2i(0, 0); glRasterPos2i(0, 0);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); glDrawPixels(width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
glutSwapBuffers(); glutSwapBuffers();
glutReportErrors(); glutReportErrors();
sdkStopTimer(&timer); sdkStopTimer(&timer);
computeFPS(); computeFPS();
} }
void idle() { void idle()
if (animate) { {
w += 0.01f; if (animate) {
glutPostRedisplay(); w += 0.01f;
} glutPostRedisplay();
}
} }
void keyboard(unsigned char key, int x, int y) { void keyboard(unsigned char key, int x, int y)
switch (key) { {
switch (key) {
case 27: case 27:
#if defined(__APPLE__) || defined(MACOSX) #if defined(__APPLE__) || defined(MACOSX)
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
glutDestroyWindow(glutGetWindow()); glutDestroyWindow(glutGetWindow());
return; return;
#else #else
glutDestroyWindow(glutGetWindow()); glutDestroyWindow(glutGetWindow());
return; return;
#endif #endif
case '=': case '=':
case '+': case '+':
w += 0.01f; w += 0.01f;
break; break;
case '-': case '-':
w -= 0.01f; w -= 0.01f;
break; break;
case 'f': case 'f':
linearFiltering = !linearFiltering; linearFiltering = !linearFiltering;
setTextureFilterMode(linearFiltering); setTextureFilterMode(linearFiltering);
break; break;
case ' ': case ' ':
animate = !animate; animate = !animate;
break; break;
default: default:
break; break;
} }
glutPostRedisplay(); glutPostRedisplay();
} }
void reshape(int x, int y) { void reshape(int x, int y)
glViewport(0, 0, x, y); {
glViewport(0, 0, x, y);
glMatrixMode(GL_MODELVIEW); glMatrixMode(GL_MODELVIEW);
glLoadIdentity(); glLoadIdentity();
glMatrixMode(GL_PROJECTION); glMatrixMode(GL_PROJECTION);
glLoadIdentity(); glLoadIdentity();
glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
} }
void cleanup() { void cleanup()
sdkDeleteTimer(&timer); {
sdkDeleteTimer(&timer);
// add extra check to unmap the resource before unregistering it // add extra check to unmap the resource before unregistering it
if (g_GraphicsMapFlag) { if (g_GraphicsMapFlag) {
checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0)); checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_pbo_resource, 0));
g_GraphicsMapFlag--; g_GraphicsMapFlag--;
} }
// unregister this buffer object from CUDA C // unregister this buffer object from CUDA C
checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource)); checkCudaErrors(cudaGraphicsUnregisterResource(cuda_pbo_resource));
glDeleteBuffers(1, &pbo); glDeleteBuffers(1, &pbo);
cleanupCuda(); cleanupCuda();
} }
void initGLBuffers() { void initGLBuffers()
// create pixel buffer object {
glGenBuffers(1, &pbo); // create pixel buffer object
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); glGenBuffers(1, &pbo);
glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
0, GL_STREAM_DRAW_ARB); glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, 0, GL_STREAM_DRAW_ARB);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
// register this buffer object with CUDA // register this buffer object with CUDA
checkCudaErrors(cudaGraphicsGLRegisterBuffer( checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));
&cuda_pbo_resource, pbo, cudaGraphicsMapFlagsWriteDiscard));
} }
// Load raw data from disk // Load raw data from disk
uchar *loadRawFile(const char *filename, size_t size) { uchar *loadRawFile(const char *filename, size_t size)
FILE *fp = fopen(filename, "rb"); {
FILE *fp = fopen(filename, "rb");
if (!fp) { if (!fp) {
fprintf(stderr, "Error opening file '%s'\n", filename); fprintf(stderr, "Error opening file '%s'\n", filename);
return 0; return 0;
} }
uchar *data = (uchar *)malloc(size); uchar *data = (uchar *)malloc(size);
size_t read = fread(data, 1, size, fp); size_t read = fread(data, 1, size, fp);
fclose(fp); fclose(fp);
printf("Read '%s', %zu bytes\n", filename, read); printf("Read '%s', %zu bytes\n", filename, read);
return data; return data;
} }
void initGL(int *argc, char **argv) { void initGL(int *argc, char **argv)
// initialize GLUT callback functions {
glutInit(argc, argv); // initialize GLUT callback functions
glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); glutInit(argc, argv);
glutInitWindowSize(width, height); glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
glutCreateWindow("CUDA 3D texture"); glutInitWindowSize(width, height);
glutDisplayFunc(display); glutCreateWindow("CUDA 3D texture");
glutKeyboardFunc(keyboard); glutDisplayFunc(display);
glutReshapeFunc(reshape); glutKeyboardFunc(keyboard);
glutIdleFunc(idle); glutReshapeFunc(reshape);
glutIdleFunc(idle);
if (!isGLVersionSupported(2, 0) || if (!isGLVersionSupported(2, 0) || !areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) {
!areGLExtensionsSupported("GL_ARB_pixel_buffer_object")) { fprintf(stderr, "Required OpenGL extensions are missing.");
fprintf(stderr, "Required OpenGL extensions are missing."); exit(EXIT_FAILURE);
exit(EXIT_FAILURE); }
}
} }
void runAutoTest(const char *ref_file, char *exec_path) { void runAutoTest(const char *ref_file, char *exec_path)
checkCudaErrors( {
cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4)); checkCudaErrors(cudaMalloc((void **)&d_output, width * height * sizeof(GLubyte) * 4));
// render the volumeData // render the volumeData
render_kernel(gridSize, blockSize, d_output, width, height, w); render_kernel(gridSize, blockSize, d_output, width, height, w);
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
getLastCudaError("render_kernel failed"); getLastCudaError("render_kernel failed");
void *h_output = malloc(width * height * sizeof(GLubyte) * 4); void *h_output = malloc(width * height * sizeof(GLubyte) * 4);
checkCudaErrors(cudaMemcpy(h_output, d_output, checkCudaErrors(cudaMemcpy(h_output, d_output, width * height * sizeof(GLubyte) * 4, cudaMemcpyDeviceToHost));
width * height * sizeof(GLubyte) * 4, sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4, "simpleTexture3D.bin");
cudaMemcpyDeviceToHost));
sdkDumpBin(h_output, width * height * sizeof(GLubyte) * 4,
"simpleTexture3D.bin");
bool bTestResult = sdkCompareBin2BinFloat( bool bTestResult = sdkCompareBin2BinFloat("simpleTexture3D.bin",
"simpleTexture3D.bin", sdkFindFilePath(ref_file, exec_path), sdkFindFilePath(ref_file, exec_path),
width * height, MAX_EPSILON_ERROR, THRESHOLD, exec_path); width * height,
MAX_EPSILON_ERROR,
THRESHOLD,
exec_path);
checkCudaErrors(cudaFree(d_output)); checkCudaErrors(cudaFree(d_output));
free(h_output); free(h_output);
sdkStopTimer(&timer); sdkStopTimer(&timer);
sdkDeleteTimer(&timer); sdkDeleteTimer(&timer);
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
void loadVolumeData(char *exec_path) { void loadVolumeData(char *exec_path)
// load volume data {
const char *path = sdkFindFilePath(volumeFilename, exec_path); // load volume data
const char *path = sdkFindFilePath(volumeFilename, exec_path);
if (path == NULL) { if (path == NULL) {
fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
volumeFilename); exit(EXIT_FAILURE);
exit(EXIT_FAILURE); }
}
size_t size = volumeSize.width * volumeSize.height * volumeSize.depth; size_t size = volumeSize.width * volumeSize.height * volumeSize.depth;
uchar *h_volume = loadRawFile(path, size); uchar *h_volume = loadRawFile(path, size);
initCuda(h_volume, volumeSize); initCuda(h_volume, volumeSize);
sdkCreateTimer(&timer); sdkCreateTimer(&timer);
free(h_volume); free(h_volume);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv) { int main(int argc, char **argv)
pArgc = &argc; {
pArgv = argv; pArgc = &argc;
pArgv = argv;
char *ref_file = NULL; char *ref_file = NULL;
#if defined(__linux__) #if defined(__linux__)
setenv("DISPLAY", ":0", 0); setenv("DISPLAY", ":0", 0);
#endif #endif
printf("%s Starting...\n\n", sSDKsample); printf("%s Starting...\n\n", sSDKsample);
if (checkCmdLineFlag(argc, (const char **)argv, "file")) { if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
fpsLimit = frameCheckNumber; fpsLimit = frameCheckNumber;
getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
} }
// use command-line specified CUDA device, otherwise use device with highest // use command-line specified CUDA device, otherwise use device with highest
// Gflops/s // Gflops/s
findCudaDevice(argc, (const char **)argv); findCudaDevice(argc, (const char **)argv);
if (ref_file) { if (ref_file) {
loadVolumeData(argv[0]); loadVolumeData(argv[0]);
runAutoTest(ref_file, argv[0]); runAutoTest(ref_file, argv[0]);
} else { }
initGL(&argc, argv); else {
initGL(&argc, argv);
// OpenGL buffers // OpenGL buffers
initGLBuffers(); initGLBuffers();
loadVolumeData(argv[0]); loadVolumeData(argv[0]);
} }
printf( printf("Press space to toggle animation\n"
"Press space to toggle animation\n" "Press '+' and '-' to change displayed slice\n");
"Press '+' and '-' to change displayed slice\n");
#if defined(__APPLE__) || defined(MACOSX) #if defined(__APPLE__) || defined(MACOSX)
atexit(cleanup); atexit(cleanup);
#else #else
glutCloseFunc(cleanup); glutCloseFunc(cleanup);
#endif #endif
glutMainLoop(); glutMainLoop();
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }


@ -28,111 +28,111 @@
#ifndef _SIMPLETEXTURE3D_KERNEL_CU_ #ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#define _SIMPLETEXTURE3D_KERNEL_CU_ #define _SIMPLETEXTURE3D_KERNEL_CU_
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <helper_cuda.h> #include <helper_cuda.h>
#include <helper_math.h> #include <helper_math.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef unsigned int uint; typedef unsigned int uint;
typedef unsigned char uchar; typedef unsigned char uchar;
cudaArray *d_volumeArray = 0; cudaArray *d_volumeArray = 0;
cudaTextureObject_t tex; // 3D texture cudaTextureObject_t tex; // 3D texture
__global__ void d_render(uint *d_output, uint imageW, uint imageH, float w, __global__ void d_render(uint *d_output, uint imageW, uint imageH, float w, cudaTextureObject_t texObj)
cudaTextureObject_t texObj) { {
uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x; uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y; uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
float u = x / (float)imageW; float u = x / (float)imageW;
float v = y / (float)imageH; float v = y / (float)imageH;
// read from 3D texture // read from 3D texture
float voxel = tex3D<float>(texObj, u, v, w); float voxel = tex3D<float>(texObj, u, v, w);
if ((x < imageW) && (y < imageH)) { if ((x < imageW) && (y < imageH)) {
// write output color // write output color
uint i = __umul24(y, imageW) + x; uint i = __umul24(y, imageW) + x;
d_output[i] = voxel * 255; d_output[i] = voxel * 255;
} }
} }
extern "C" void setTextureFilterMode(bool bLinearFilter) { extern "C" void setTextureFilterMode(bool bLinearFilter)
if (tex) { {
checkCudaErrors(cudaDestroyTextureObject(tex)); if (tex) {
} checkCudaErrors(cudaDestroyTextureObject(tex));
cudaResourceDesc texRes; }
memset(&texRes, 0, sizeof(cudaResourceDesc)); cudaResourceDesc texRes;
memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray; texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = d_volumeArray; texRes.res.array.array = d_volumeArray;
cudaTextureDesc texDescr; cudaTextureDesc texDescr;
memset(&texDescr, 0, sizeof(cudaTextureDesc)); memset(&texDescr, 0, sizeof(cudaTextureDesc));
texDescr.normalizedCoords = true; texDescr.normalizedCoords = true;
    texDescr.filterMode = texDescr.filterMode = bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint;
    bLinearFilter ? cudaFilterModeLinear : cudaFilterModePoint; texDescr.addressMode[0] = cudaAddressModeWrap;
    texDescr.addressMode[0] = cudaAddressModeWrap; texDescr.addressMode[1] = cudaAddressModeWrap;
    texDescr.addressMode[1] = cudaAddressModeWrap; texDescr.addressMode[2] = cudaAddressModeWrap;
    texDescr.addressMode[2] = cudaAddressModeWrap; texDescr.readMode = cudaReadModeNormalizedFloat;
    texDescr.readMode = cudaReadModeNormalizedFloat;
checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
} }
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize)
{
    // create 3D array
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
    checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));

    // copy data to 3D array
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr =
        make_cudaPitchedPtr((void *)h_volume, volumeSize.width * sizeof(uchar), volumeSize.width, volumeSize.height);
    copyParams.dstArray = d_volumeArray;
    copyParams.extent   = volumeSize;
    copyParams.kind     = cudaMemcpyHostToDevice;
    checkCudaErrors(cudaMemcpy3D(&copyParams));
cudaResourceDesc texRes; cudaResourceDesc texRes;
memset(&texRes, 0, sizeof(cudaResourceDesc)); memset(&texRes, 0, sizeof(cudaResourceDesc));
texRes.resType = cudaResourceTypeArray; texRes.resType = cudaResourceTypeArray;
texRes.res.array.array = d_volumeArray; texRes.res.array.array = d_volumeArray;
cudaTextureDesc texDescr; cudaTextureDesc texDescr;
memset(&texDescr, 0, sizeof(cudaTextureDesc)); memset(&texDescr, 0, sizeof(cudaTextureDesc));
// access with normalized texture coordinates // access with normalized texture coordinates
texDescr.normalizedCoords = true; texDescr.normalizedCoords = true;
// linear interpolation // linear interpolation
texDescr.filterMode = cudaFilterModeLinear; texDescr.filterMode = cudaFilterModeLinear;
// wrap texture coordinates // wrap texture coordinates
texDescr.addressMode[0] = cudaAddressModeWrap; texDescr.addressMode[0] = cudaAddressModeWrap;
texDescr.addressMode[1] = cudaAddressModeWrap; texDescr.addressMode[1] = cudaAddressModeWrap;
texDescr.addressMode[2] = cudaAddressModeWrap; texDescr.addressMode[2] = cudaAddressModeWrap;
texDescr.readMode = cudaReadModeNormalizedFloat; texDescr.readMode = cudaReadModeNormalizedFloat;
checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL)); checkCudaErrors(cudaCreateTextureObject(&tex, &texRes, &texDescr, NULL));
} }
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, float w)
{
    d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, w, tex);
}

void cleanupCuda()
{
    if (tex) {
        checkCudaErrors(cudaDestroyTextureObject(tex));
    }
    if (d_volumeArray) {
        checkCudaErrors(cudaFreeArray(d_volumeArray));
    }
}
#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_ #endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_
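As context for the kernels above, a minimal host-side sketch of how render_kernel might be driven; the output size (512x512) and the d_output buffer here are illustrative assumptions, not part of the sample:

    // Sketch only: size the grid so every output pixel gets one thread.
    uint  width = 512, height = 512; // assumed output dimensions
    uint *d_output = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_output, width * height * sizeof(uint)));

    dim3 blockSize(16, 16, 1);
    dim3 gridSize((width + blockSize.x - 1) / blockSize.x, (height + blockSize.y - 1) / blockSize.y, 1);

    // w selects the slice in the normalized [0,1] depth range of the 3D texture.
    render_kernel(gridSize, blockSize, d_output, width, height, 0.5f);
    checkCudaErrors(cudaDeviceSynchronize());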

View File

@ -26,29 +26,29 @@
*/ */
/* /*
 * This sample demonstrates how to use texture fetches in CUDA
* *
* This sample takes an input PGM image (image_filename) and generates * This sample takes an input PGM image (image_filename) and generates
* an output PGM image (image_filename_out). This CUDA kernel performs * an output PGM image (image_filename_out). This CUDA kernel performs
* a simple 2D transform (rotation) on the texture coordinates (u,v). * a simple 2D transform (rotation) on the texture coordinates (u,v).
* The results between simpleTexture and simpleTextureDrv are identical. * The results between simpleTexture and simpleTextureDrv are identical.
* The main difference is the implementation. simpleTextureDrv makes calls * The main difference is the implementation. simpleTextureDrv makes calls
* to the CUDA driver API and demonstrates how to use cuModuleLoad to load * to the CUDA driver API and demonstrates how to use cuModuleLoad to load
* the CUDA ptx (*.ptx) kernel just prior to kernel launch. * the CUDA ptx (*.ptx) kernel just prior to kernel launch.
* *
*/ */
// includes, system // includes, system
#include <cstring>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// includes, CUDA
#include <builtin_types.h>
#include <cuda.h>
// includes, project // includes, project
#include <helper_cuda_drvapi.h> #include <helper_cuda_drvapi.h>
#include <helper_functions.h> #include <helper_functions.h>
@ -56,8 +56,8 @@
using namespace std; using namespace std;
const char *image_filename = "teapot512.pgm"; const char *image_filename = "teapot512.pgm";
const char *ref_filename = "ref_rotated.pgm"; const char *ref_filename = "ref_rotated.pgm";
float angle = 0.5f; // angle to rotate image by (in radians) float angle = 0.5f; // angle to rotate image by (in radians)
#define MIN_EPSILON_ERROR 5e-3f #define MIN_EPSILON_ERROR 5e-3f
@ -65,8 +65,7 @@ float angle = 0.5f; // angle to rotate image by (in radians)
// declaration, forward // declaration, forward
void runTest(int argc, char **argv); void runTest(int argc, char **argv);
extern "C" void computeGold(float *reference, float *idata, const unsigned int len);
static CUresult initCUDA(int argc, char **argv, CUfunction *); static CUresult initCUDA(int argc, char **argv, CUfunction *);
@ -80,212 +79,227 @@ const char *sSDKsample = "simpleTextureDrv (Driver API)";
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Globals // Globals
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
CUdevice cuDevice; CUdevice cuDevice;
CUcontext cuContext; CUcontext cuContext;
CUmodule cuModule; CUmodule cuModule;
void showHelp()
{
    printf("\n> [%s] Command line options\n", sSDKsample);
    printf("\t-device=n (where n=0,1,2.... for the GPU device)\n\n");
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
// Program main // Program main
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
        showHelp();
        return 0;
    }

    runTest(argc, argv);
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
    bool bTestResults = true;
// initialize CUDA // initialize CUDA
CUfunction transform = NULL; CUfunction transform = NULL;
if (initCUDA(argc, argv, &transform) != CUDA_SUCCESS) { if (initCUDA(argc, argv, &transform) != CUDA_SUCCESS) {
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
// load image from disk // load image from disk
float *h_data = NULL; float *h_data = NULL;
unsigned int width, height; unsigned int width, height;
char *image_path = sdkFindFilePath(image_filename, argv[0]); char *image_path = sdkFindFilePath(image_filename, argv[0]);
if (image_path == NULL) { if (image_path == NULL) {
printf("Unable to find image file: '%s'\n", image_filename); printf("Unable to find image file: '%s'\n", image_filename);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
sdkLoadPGM(image_path, &h_data, &width, &height); sdkLoadPGM(image_path, &h_data, &width, &height);
size_t size = width * height * sizeof(float); size_t size = width * height * sizeof(float);
printf("Loaded '%s', %d x %d pixels\n", image_filename, width, height); printf("Loaded '%s', %d x %d pixels\n", image_filename, width, height);
// load reference image from image (output) // load reference image from image (output)
float *h_data_ref = (float *)malloc(size); float *h_data_ref = (float *)malloc(size);
char *ref_path = sdkFindFilePath(ref_filename, argv[0]); char *ref_path = sdkFindFilePath(ref_filename, argv[0]);
if (ref_path == NULL) { if (ref_path == NULL) {
printf("Unable to find reference file %s\n", ref_filename); printf("Unable to find reference file %s\n", ref_filename);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
sdkLoadPGM(ref_path, &h_data_ref, &width, &height); sdkLoadPGM(ref_path, &h_data_ref, &width, &height);
// allocate device memory for result // allocate device memory for result
CUdeviceptr d_data = (CUdeviceptr)NULL; CUdeviceptr d_data = (CUdeviceptr)NULL;
checkCudaErrors(cuMemAlloc(&d_data, size)); checkCudaErrors(cuMemAlloc(&d_data, size));
// allocate array and copy image data // allocate array and copy image data
CUarray cu_array; CUarray cu_array;
CUDA_ARRAY_DESCRIPTOR desc; CUDA_ARRAY_DESCRIPTOR desc;
desc.Format = CU_AD_FORMAT_FLOAT; desc.Format = CU_AD_FORMAT_FLOAT;
desc.NumChannels = 1; desc.NumChannels = 1;
desc.Width = width; desc.Width = width;
desc.Height = height; desc.Height = height;
checkCudaErrors(cuArrayCreate(&cu_array, &desc)); checkCudaErrors(cuArrayCreate(&cu_array, &desc));
CUDA_MEMCPY2D copyParam; CUDA_MEMCPY2D copyParam;
memset(&copyParam, 0, sizeof(copyParam)); memset(&copyParam, 0, sizeof(copyParam));
copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY; copyParam.dstMemoryType = CU_MEMORYTYPE_ARRAY;
copyParam.dstArray = cu_array; copyParam.dstArray = cu_array;
copyParam.srcMemoryType = CU_MEMORYTYPE_HOST; copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
copyParam.srcHost = h_data; copyParam.srcHost = h_data;
copyParam.srcPitch = width * sizeof(float); copyParam.srcPitch = width * sizeof(float);
copyParam.WidthInBytes = copyParam.srcPitch; copyParam.WidthInBytes = copyParam.srcPitch;
copyParam.Height = height; copyParam.Height = height;
checkCudaErrors(cuMemcpy2D(&copyParam)); checkCudaErrors(cuMemcpy2D(&copyParam));
// set texture parameters // set texture parameters
CUtexObject TexObject; CUtexObject TexObject;
CUDA_RESOURCE_DESC ResDesc; CUDA_RESOURCE_DESC ResDesc;
memset(&ResDesc, 0, sizeof(CUDA_RESOURCE_DESC)); memset(&ResDesc, 0, sizeof(CUDA_RESOURCE_DESC));
ResDesc.resType = CU_RESOURCE_TYPE_ARRAY; ResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
ResDesc.res.array.hArray = cu_array; ResDesc.res.array.hArray = cu_array;
CUDA_TEXTURE_DESC TexDesc; CUDA_TEXTURE_DESC TexDesc;
memset(&TexDesc, 0, sizeof(CUDA_TEXTURE_DESC)); memset(&TexDesc, 0, sizeof(CUDA_TEXTURE_DESC));
TexDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP; TexDesc.addressMode[0] = CU_TR_ADDRESS_MODE_WRAP;
TexDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP; TexDesc.addressMode[1] = CU_TR_ADDRESS_MODE_WRAP;
TexDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP; TexDesc.addressMode[2] = CU_TR_ADDRESS_MODE_WRAP;
TexDesc.filterMode = CU_TR_FILTER_MODE_LINEAR; TexDesc.filterMode = CU_TR_FILTER_MODE_LINEAR;
TexDesc.flags = CU_TRSF_NORMALIZED_COORDINATES; TexDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
checkCudaErrors(cuTexObjectCreate(&TexObject, &ResDesc, &TexDesc, NULL)); checkCudaErrors(cuTexObjectCreate(&TexObject, &ResDesc, &TexDesc, NULL));
// There are two ways to launch CUDA kernels via the Driver API. // There are two ways to launch CUDA kernels via the Driver API.
// In this CUDA Sample, we illustrate both ways to pass parameters // In this CUDA Sample, we illustrate both ways to pass parameters
// and specify parameters. By default we use the simpler method. // and specify parameters. By default we use the simpler method.
    int block_size = 8;
    StopWatchInterface *timer = NULL;

    if (1) {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (simpler method)
        void *args[5] = {&d_data, &width, &height, &angle, &TexObject};

        checkCudaErrors(cuLaunchKernel(
            transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
        checkCudaErrors(cuCtxSynchronize());
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);

        // launch kernel again for performance measurement
        checkCudaErrors(cuLaunchKernel(
            transform, (width / block_size), (height / block_size), 1, block_size, block_size, 1, 0, NULL, args, NULL));
    }
    else {
        // This is the new CUDA 4.0 API for Kernel Parameter passing and Kernel
        // Launching (advanced method)
        int  offset = 0;
        char argBuffer[256];

        // pass in launch parameters (not actually de-referencing CUdeviceptr).
        // CUdeviceptr is storing the value of the parameters
        *((CUdeviceptr *)&argBuffer[offset]) = d_data;
        offset += sizeof(d_data);
        *((unsigned int *)&argBuffer[offset]) = width;
        offset += sizeof(width);
        *((unsigned int *)&argBuffer[offset]) = height;
        offset += sizeof(height);
        *((float *)&argBuffer[offset]) = angle;
        offset += sizeof(angle);
        *((CUtexObject *)&argBuffer[offset]) = TexObject;
        offset += sizeof(TexObject);

        void *kernel_launch_config[5] = {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, CU_LAUNCH_PARAM_BUFFER_SIZE, &offset, CU_LAUNCH_PARAM_END};

        // new CUDA 4.0 Driver API Kernel launch call (warmup)
        checkCudaErrors(cuLaunchKernel(transform,
                                       (width / block_size),
                                       (height / block_size),
                                       1,
                                       block_size,
                                       block_size,
                                       1,
                                       0,
                                       NULL,
                                       NULL,
                                       (void **)&kernel_launch_config));
        checkCudaErrors(cuCtxSynchronize());
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);

        // launch kernel again for performance measurement
        checkCudaErrors(cuLaunchKernel(transform,
                                       (width / block_size),
                                       (height / block_size),
                                       1,
                                       block_size,
                                       block_size,
                                       1,
                                       0,
                                       0,
                                       NULL,
                                       (void **)&kernel_launch_config));
    }

    checkCudaErrors(cuCtxSynchronize());
    sdkStopTimer(&timer);
    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
    sdkDeleteTimer(&timer);

    // allocate mem for the result on host side
    float *h_odata = (float *)malloc(size);
    // copy result from device to host
    checkCudaErrors(cuMemcpyDtoH(h_odata, d_data, size));

    // write result to file
    char output_filename[1024];
    strcpy(output_filename, image_path);
    strcpy(output_filename + strlen(image_path) - 4, "_out.pgm");
    sdkSavePGM(output_filename, h_odata, width, height);
    printf("Wrote '%s'\n", output_filename);

    // write regression file if necessary
    if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
        // write file for regression test
        sdkWriteFile<float>("./data/regression.dat", h_odata, width * height, 0.0f, false);
    }
    else {
        // We need to reload the data from disk, because it is inverted upon output
        sdkLoadPGM(output_filename, &h_odata, &width, &height);

        printf("Comparing files\n");
        printf("\toutput: <%s>\n", output_filename);
        printf("\treference: <%s>\n", ref_path);

        bTestResults = compareData(h_odata, h_data_ref, width * height, MIN_EPSILON_ERROR, 0.15f);
    }

    // cleanup memory
    checkCudaErrors(cuTexObjectDestroy(TexObject));
    checkCudaErrors(cuMemFree(d_data));
    checkCudaErrors(cuArrayDestroy(cu_array));

    free(image_path);
    free(ref_path);

    checkCudaErrors(cuCtxDestroy(cuContext));

    exit(bTestResults ? EXIT_SUCCESS : EXIT_FAILURE);
}
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -293,45 +307,44 @@ void runTest(int argc, char **argv) {
//! kernel function. After the module is loaded, cuModuleGetFunction //! kernel function. After the module is loaded, cuModuleGetFunction
//! retrieves the CUDA function pointer "cuFunction" //! retrieves the CUDA function pointer "cuFunction"
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
static CUresult initCUDA(int argc, char **argv, CUfunction *transform)
{
    CUfunction cuFunction = 0;
    int        major = 0, minor = 0, devID = 0;
    char       deviceName[100];
    string     module_path;

    cuDevice = findCudaDeviceDRV(argc, (const char **)argv);

    // get compute capabilities and the devicename
    checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
    checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
    checkCudaErrors(cuDeviceGetName(deviceName, sizeof(deviceName), cuDevice));
    printf("> GPU Device has SM %d.%d compute capability\n", major, minor);

    checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));

    // first search for the module_path before we try to load the results
    std::ostringstream fatbin;

    if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
        exit(EXIT_FAILURE);
    }
    else {
        printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    if (!fatbin.str().size()) {
        printf("fatbin file empty. exiting..\n");
        exit(EXIT_FAILURE);
    }

    // Create module from binary file (FATBIN)
    checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
    checkCudaErrors(cuModuleGetFunction(&cuFunction, cuModule, "transformKernel"));

    *transform = cuFunction;

    return CUDA_SUCCESS;
}
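The runTest() above notes that the Driver API accepts kernel arguments in two ways; a condensed sketch of both, assuming a loaded CUfunction f that takes a single int argument, might look like this:

    // Style 1: kernelParams - an array of pointers, one per kernel argument.
    int   n = 42;
    void *params[] = {&n};
    checkCudaErrors(cuLaunchKernel(f, 1, 1, 1, 32, 1, 1, 0, NULL, params, NULL));

    // Style 2: extra - arguments packed into a single raw buffer.
    char   buf[sizeof(int)];
    size_t bufSize = sizeof(buf);
    memcpy(buf, &n, sizeof(int));
    void *extra[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, buf, CU_LAUNCH_PARAM_BUFFER_SIZE, &bufSize, CU_LAUNCH_PARAM_END};
    checkCudaErrors(cuLaunchKernel(f, 1, 1, 1, 32, 1, 1, 0, NULL, NULL, extra));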

View File

@ -33,23 +33,22 @@
//! Transform an image using texture lookups //! Transform an image using texture lookups
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
extern "C" __global__ void transformKernel(float *g_odata, int width, int height, float theta, CUtexObject tex)
{
    // calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
float u = (float)x - (float)width / 2; float u = (float)x - (float)width / 2;
float v = (float)y - (float)height / 2; float v = (float)y - (float)height / 2;
float tu = u * cosf(theta) - v * sinf(theta); float tu = u * cosf(theta) - v * sinf(theta);
float tv = v * cosf(theta) + u * sinf(theta); float tv = v * cosf(theta) + u * sinf(theta);
tu /= (float)width; tu /= (float)width;
tv /= (float)height; tv /= (float)height;
// read from texture and write to global memory // read from texture and write to global memory
g_odata[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f); g_odata[y * width + x] = tex2D<float>(tex, tu + 0.5f, tv + 0.5f);
} }
#endif // #ifndef _SIMPLETEXTURE_KERNEL_H_ #endif // #ifndef _SIMPLETEXTURE_KERNEL_H_
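The kernel rotates texture coordinates about the image center, so a host-side launch would pair it with an 8x8 block (matching block_size in the driver-API host code above). A runtime-API sketch, with d_out, texObj, width and height assumed to exist:

    // Sketch only: the sample itself launches this kernel through the Driver API fatbin.
    dim3 block(8, 8, 1);
    dim3 grid(width / block.x, height / block.y, 1); // assumes width/height are multiples of 8
    transformKernel<<<grid, block>>>(d_out, width, height, 0.5f, texObj);
    checkCudaErrors(cudaDeviceSynchronize());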

View File

@ -53,257 +53,237 @@ static const char *sSDKsample = "[simpleVoteIntrinsics]\0";
#include "simpleVote_kernel.cuh" #include "simpleVote_kernel.cuh"
// Generate the test pattern for Tests 1 and 2 // Generate the test pattern for Tests 1 and 2
void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size) { void genVoteTestPattern(unsigned int *VOTE_PATTERN, int size)
// For testing VOTE.Any (all of these threads will return 0) {
for (int i = 0; i < size / 4; i++) { // For testing VOTE.Any (all of these threads will return 0)
VOTE_PATTERN[i] = 0x00000000; for (int i = 0; i < size / 4; i++) {
} VOTE_PATTERN[i] = 0x00000000;
// For testing VOTE.Any (1/2 these threads will return 1)
for (int i = 2 * size / 8; i < 4 * size / 8; i++) {
VOTE_PATTERN[i] = (i & 0x01) ? i : 0;
}
// For testing VOTE.all (1/2 of these threads will return 0)
for (int i = 2 * size / 4; i < 3 * size / 4; i++) {
VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;
}
// For testing VOTE.all (all of these threads will return 1)
for (int i = 3 * size / 4; i < 4 * size / 4; i++) {
VOTE_PATTERN[i] = 0xffffffff;
}
}
int checkErrors1(unsigned int *h_result, int start, int end, int warp_size,
const char *voteType) {
int i, sum = 0;
for (sum = 0, i = start; i < end; i++) {
sum += h_result[i];
}
if (sum > 0) {
printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);
for (i = start; i < end; i++) {
printf("%d", h_result[i]);
} }
printf("%d values FAILED\n", sum); // For testing VOTE.Any (1/2 these threads will return 1)
} for (int i = 2 * size / 8; i < 4 * size / 8; i++) {
VOTE_PATTERN[i] = (i & 0x01) ? i : 0;
return (sum > 0);
}
int checkErrors2(unsigned int *h_result, int start, int end, int warp_size,
const char *voteType) {
int i, sum = 0;
for (sum = 0, i = start; i < end; i++) {
sum += h_result[i];
}
if (sum != warp_size) {
printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);
for (i = start; i < end; i++) {
printf("%d", h_result[i]);
} }
printf(" - FAILED\n"); // For testing VOTE.all (1/2 of these threads will return 0)
} for (int i = 2 * size / 4; i < 3 * size / 4; i++) {
VOTE_PATTERN[i] = (i & 0x01) ? 0 : i;
}
return (sum != warp_size); // For testing VOTE.all (all of these threads will return 1)
for (int i = 3 * size / 4; i < 4 * size / 4; i++) {
VOTE_PATTERN[i] = 0xffffffff;
}
}
int checkErrors1(unsigned int *h_result, int start, int end, int warp_size, const char *voteType)
{
int i, sum = 0;
for (sum = 0, i = start; i < end; i++) {
sum += h_result[i];
}
if (sum > 0) {
printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);
for (i = start; i < end; i++) {
printf("%d", h_result[i]);
}
printf("%d values FAILED\n", sum);
}
return (sum > 0);
}
int checkErrors2(unsigned int *h_result, int start, int end, int warp_size, const char *voteType)
{
int i, sum = 0;
for (sum = 0, i = start; i < end; i++) {
sum += h_result[i];
}
if (sum != warp_size) {
printf("\t<%s>[%d - %d] = ", voteType, start, end - 1);
for (i = start; i < end; i++) {
printf("%d", h_result[i]);
}
printf(" - FAILED\n");
}
return (sum != warp_size);
} }
// Verification code for Kernel #1 // Verification code for Kernel #1
int checkResultsVoteAnyKernel1(unsigned int *h_result, int size, int checkResultsVoteAnyKernel1(unsigned int *h_result, int size, int warp_size)
int warp_size) { {
int error_count = 0; int error_count = 0;
error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
warp_size, "Vote.Any"); error_count += checkErrors2(
error_count += h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
checkErrors2(h_result, VOTE_DATA_GROUP * warp_size / 4, error_count += checkErrors2(
2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any"); h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
error_count += error_count += checkErrors2(
checkErrors2(h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
error_count +=
checkErrors2(h_result, 3 * VOTE_DATA_GROUP * warp_size / 4,
4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.Any");
printf((error_count == 0) ? "\tOK\n" : "\tERROR\n"); printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
return error_count; return error_count;
} }
// Verification code for Kernel #2 // Verification code for Kernel #2
int checkResultsVoteAllKernel2(unsigned int *h_result, int size, int checkResultsVoteAllKernel2(unsigned int *h_result, int size, int warp_size)
int warp_size) { {
int error_count = 0; int error_count = 0;
error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, error_count += checkErrors1(h_result, 0, VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
warp_size, "Vote.All"); error_count += checkErrors1(
error_count += h_result, VOTE_DATA_GROUP * warp_size / 4, 2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
checkErrors1(h_result, VOTE_DATA_GROUP * warp_size / 4, error_count += checkErrors1(
2 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All"); h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, 3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
error_count += error_count += checkErrors2(
checkErrors1(h_result, 2 * VOTE_DATA_GROUP * warp_size / 4, h_result, 3 * VOTE_DATA_GROUP * warp_size / 4, 4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
3 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
error_count +=
checkErrors2(h_result, 3 * VOTE_DATA_GROUP * warp_size / 4,
4 * VOTE_DATA_GROUP * warp_size / 4, warp_size, "Vote.All");
printf((error_count == 0) ? "\tOK\n" : "\tERROR\n"); printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
return error_count; return error_count;
} }
// Verification code for Kernel #3 // Verification code for Kernel #3
int checkResultsVoteAnyKernel3(bool *hinfo, int size) { int checkResultsVoteAnyKernel3(bool *hinfo, int size)
int i, error_count = 0; {
int i, error_count = 0;
for (i = 0; i < size * 3; i++) { for (i = 0; i < size * 3; i++) {
switch (i % 3) { switch (i % 3) {
case 0: case 0:
// First warp should be all zeros. // First warp should be all zeros.
if (hinfo[i] != (i >= size * 1)) { if (hinfo[i] != (i >= size * 1)) {
error_count++; error_count++;
}
break;
case 1:
// First warp and half of second should be all zeros.
if (hinfo[i] != (i >= size * 3 / 2)) {
error_count++;
}
break;
case 2:
// First two warps should be all zeros.
if (hinfo[i] != (i >= size * 2)) {
error_count++;
}
break;
} }
break;
case 1:
// First warp and half of second should be all zeros.
if (hinfo[i] != (i >= size * 3 / 2)) {
error_count++;
}
break;
case 2:
// First two warps should be all zeros.
if (hinfo[i] != (i >= size * 2)) {
error_count++;
}
break;
} }
}
printf((error_count == 0) ? "\tOK\n" : "\tERROR\n"); printf((error_count == 0) ? "\tOK\n" : "\tERROR\n");
return error_count; return error_count;
} }
int main(int argc, char **argv) { int main(int argc, char **argv)
unsigned int *h_input, *h_result; {
unsigned int *d_input, *d_result; unsigned int *h_input, *h_result;
unsigned int *d_input, *d_result;
bool *dinfo = NULL, *hinfo = NULL; bool *dinfo = NULL, *hinfo = NULL;
int error_count[3] = {0, 0, 0}; int error_count[3] = {0, 0, 0};
cudaDeviceProp deviceProp; cudaDeviceProp deviceProp;
int devID, warp_size = 32; int devID, warp_size = 32;
printf("%s\n", sSDKsample); printf("%s\n", sSDKsample);
// This will pick the best possible CUDA capable device // This will pick the best possible CUDA capable device
devID = findCudaDevice(argc, (const char **)argv); devID = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
// Statistics about the GPU device // Statistics about the GPU device
printf( printf("> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n",
"> GPU device has %d Multi-Processors, SM %d.%d compute capabilities\n\n", deviceProp.multiProcessorCount,
deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); deviceProp.major,
deviceProp.minor);
h_input = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * h_input = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int));
sizeof(unsigned int)); h_result = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * sizeof(unsigned int));
h_result = (unsigned int *)malloc(VOTE_DATA_GROUP * warp_size * checkCudaErrors(
sizeof(unsigned int)); cudaMalloc(reinterpret_cast<void **>(&d_input), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
checkCudaErrors( checkCudaErrors(
cudaMalloc(reinterpret_cast<void **>(&d_input), cudaMalloc(reinterpret_cast<void **>(&d_result), VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int))); genVoteTestPattern(h_input, VOTE_DATA_GROUP * warp_size);
checkCudaErrors( checkCudaErrors(
cudaMalloc(reinterpret_cast<void **>(&d_result), cudaMemcpy(d_input, h_input, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyHostToDevice));
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int)));
genVoteTestPattern(h_input, VOTE_DATA_GROUP * warp_size);
checkCudaErrors(cudaMemcpy(d_input, h_input,
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int),
cudaMemcpyHostToDevice));
// Start of Vote Any Test Kernel #1 // Start of Vote Any Test Kernel #1
printf("[VOTE Kernel Test 1/3]\n"); printf("[VOTE Kernel Test 1/3]\n");
printf("\tRunning <<Vote.Any>> kernel1 ...\n"); printf("\tRunning <<Vote.Any>> kernel1 ...\n");
{ {
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
dim3 gridBlock(1, 1); dim3 gridBlock(1, 1);
dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1); dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
VoteAnyKernel1<<<gridBlock, threadBlock>>>(d_input, d_result, VoteAnyKernel1<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
VOTE_DATA_GROUP * warp_size); getLastCudaError("VoteAnyKernel() execution failed\n");
getLastCudaError("VoteAnyKernel() execution failed\n"); checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaDeviceSynchronize()); }
} checkCudaErrors(
checkCudaErrors(cudaMemcpy(h_result, d_result, cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), error_count[0] += checkResultsVoteAnyKernel1(h_result, VOTE_DATA_GROUP * warp_size, warp_size);
cudaMemcpyDeviceToHost));
error_count[0] += checkResultsVoteAnyKernel1(
h_result, VOTE_DATA_GROUP * warp_size, warp_size);
// Start of Vote All Test Kernel #2 // Start of Vote All Test Kernel #2
printf("\n[VOTE Kernel Test 2/3]\n"); printf("\n[VOTE Kernel Test 2/3]\n");
printf("\tRunning <<Vote.All>> kernel2 ...\n"); printf("\tRunning <<Vote.All>> kernel2 ...\n");
{ {
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
dim3 gridBlock(1, 1); dim3 gridBlock(1, 1);
dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1); dim3 threadBlock(VOTE_DATA_GROUP * warp_size, 1);
VoteAllKernel2<<<gridBlock, threadBlock>>>(d_input, d_result, VoteAllKernel2<<<gridBlock, threadBlock>>>(d_input, d_result, VOTE_DATA_GROUP * warp_size);
VOTE_DATA_GROUP * warp_size); getLastCudaError("VoteAllKernel() execution failed\n");
getLastCudaError("VoteAllKernel() execution failed\n"); checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaDeviceSynchronize()); }
} checkCudaErrors(
checkCudaErrors(cudaMemcpy(h_result, d_result, cudaMemcpy(h_result, d_result, VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), cudaMemcpyDeviceToHost));
VOTE_DATA_GROUP * warp_size * sizeof(unsigned int), error_count[1] += checkResultsVoteAllKernel2(h_result, VOTE_DATA_GROUP * warp_size, warp_size);
cudaMemcpyDeviceToHost));
error_count[1] += checkResultsVoteAllKernel2(
h_result, VOTE_DATA_GROUP * warp_size, warp_size);
// Second Vote Kernel Test #3 (both Any/All) // Second Vote Kernel Test #3 (both Any/All)
hinfo = reinterpret_cast<bool *>(calloc(warp_size * 3 * 3, sizeof(bool))); hinfo = reinterpret_cast<bool *>(calloc(warp_size * 3 * 3, sizeof(bool)));
cudaMalloc(reinterpret_cast<void **>(&dinfo), cudaMalloc(reinterpret_cast<void **>(&dinfo), warp_size * 3 * 3 * sizeof(bool));
warp_size * 3 * 3 * sizeof(bool)); cudaMemcpy(dinfo, hinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyHostToDevice);
cudaMemcpy(dinfo, hinfo, warp_size * 3 * 3 * sizeof(bool),
cudaMemcpyHostToDevice);
printf("\n[VOTE Kernel Test 3/3]\n"); printf("\n[VOTE Kernel Test 3/3]\n");
printf("\tRunning <<Vote.Any>> kernel3 ...\n"); printf("\tRunning <<Vote.Any>> kernel3 ...\n");
{ {
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
VoteAnyKernel3<<<1, warp_size * 3>>>(dinfo, warp_size); VoteAnyKernel3<<<1, warp_size * 3>>>(dinfo, warp_size);
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
} }
cudaMemcpy(hinfo, dinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpy(hinfo, dinfo, warp_size * 3 * 3 * sizeof(bool), cudaMemcpyDeviceToHost);
cudaMemcpyDeviceToHost);
error_count[2] = checkResultsVoteAnyKernel3(hinfo, warp_size * 3); error_count[2] = checkResultsVoteAnyKernel3(hinfo, warp_size * 3);
// Now free these resources for Test #1,2 // Now free these resources for Test #1,2
checkCudaErrors(cudaFree(d_input)); checkCudaErrors(cudaFree(d_input));
checkCudaErrors(cudaFree(d_result)); checkCudaErrors(cudaFree(d_result));
free(h_input); free(h_input);
free(h_result); free(h_result);
// Free resources from Test #3 // Free resources from Test #3
free(hinfo); free(hinfo);
cudaFree(dinfo); cudaFree(dinfo);
printf("\tShutting down...\n"); printf("\tShutting down...\n");
return (error_count[0] == 0 && error_count[1] == 0 && error_count[2] == 0) return (error_count[0] == 0 && error_count[1] == 0 && error_count[2] == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
? EXIT_SUCCESS
: EXIT_FAILURE;
} }

View File

@ -38,43 +38,44 @@
// If ANY one of the threads (within the warp) of the predicated condition // If ANY one of the threads (within the warp) of the predicated condition
// returns a non-zero value, then all threads within this warp will return a // returns a non-zero value, then all threads within this warp will return a
// non-zero value // non-zero value
__global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result, __global__ void VoteAnyKernel1(unsigned int *input, unsigned int *result, int size)
int size) { {
int tx = threadIdx.x; int tx = threadIdx.x;
int mask = 0xffffffff; int mask = 0xffffffff;
result[tx] = __any_sync(mask, input[tx]); result[tx] = __any_sync(mask, input[tx]);
} }
// Kernel #2 tests the across-the-warp vote(all) intrinsic. // Kernel #2 tests the across-the-warp vote(all) intrinsic.
// If ALL of the threads (within the warp) of the predicated condition returns // If ALL of the threads (within the warp) of the predicated condition returns
// a non-zero value, then all threads within this warp will return a non-zero // a non-zero value, then all threads within this warp will return a non-zero
// value // value
__global__ void VoteAllKernel2(unsigned int *input, unsigned int *result, __global__ void VoteAllKernel2(unsigned int *input, unsigned int *result, int size)
int size) { {
int tx = threadIdx.x; int tx = threadIdx.x;
int mask = 0xffffffff; int mask = 0xffffffff;
result[tx] = __all_sync(mask, input[tx]); result[tx] = __all_sync(mask, input[tx]);
} }
// Kernel #3 is a directed test for the across-the-warp vote(all) intrinsic. // Kernel #3 is a directed test for the across-the-warp vote(all) intrinsic.
// This kernel will test for conditions across warps, and within half warps // This kernel will test for conditions across warps, and within half warps
__global__ void VoteAnyKernel3(bool *info, int warp_size) { __global__ void VoteAnyKernel3(bool *info, int warp_size)
int tx = threadIdx.x; {
unsigned int mask = 0xffffffff; int tx = threadIdx.x;
bool *offs = info + (tx * 3); unsigned int mask = 0xffffffff;
bool *offs = info + (tx * 3);
// The following should hold true for the second and third warp // The following should hold true for the second and third warp
*offs = __any_sync(mask, (tx >= (warp_size * 3) / 2)); *offs = __any_sync(mask, (tx >= (warp_size * 3) / 2));
// The following should hold true for the "upper half" of the second warp, // The following should hold true for the "upper half" of the second warp,
// and all of the third warp // and all of the third warp
*(offs + 1) = (tx >= (warp_size * 3) / 2 ? true : false); *(offs + 1) = (tx >= (warp_size * 3) / 2 ? true : false);
// The following should hold true for the third warp only // The following should hold true for the third warp only
if (__all_sync(mask, (tx >= (warp_size * 3) / 2))) { if (__all_sync(mask, (tx >= (warp_size * 3) / 2))) {
*(offs + 2) = true; *(offs + 2) = true;
} }
} }
#endif #endif
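Alongside the __any_sync and __all_sync votes used above, a closely related warp-vote idiom (not part of this sample) counts the lanes that satisfy a predicate with __ballot_sync and __popc. A minimal sketch, assuming a full 32-lane warp is active and blockDim.x is a multiple of 32:

__global__ void countPredicate(const unsigned int *input, int *counts)
{
    unsigned int mask   = 0xffffffff;
    int          lane   = threadIdx.x & 31;
    unsigned int ballot = __ballot_sync(mask, input[threadIdx.x] != 0);

    if (lane == 0) {
        counts[threadIdx.x / 32] = __popc(ballot); // number of lanes holding a non-zero element
    }
}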

View File

@ -41,12 +41,13 @@
#endif #endif
/* Add two vectors on the GPU */ /* Add two vectors on the GPU */
__global__ void vectorAddGPU(float *a, float *b, float *c, int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N) { if (idx < N) {
c[idx] = a[idx] + b[idx]; c[idx] = a[idx] + b[idx];
} }
} }
// Allocate generic memory with malloc() and pin it later instead of using
@ -54,194 +55,196 @@ __global__ void vectorAddGPU(float *a, float *b, float *c, int N) {
bool bPinGenericMemory = false; bool bPinGenericMemory = false;
// Macro to align up to the memory size in question
#define MEMORY_ALIGNMENT 4096 #define MEMORY_ALIGNMENT 4096
#define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1))) #define ALIGN_UP(x, size) (((size_t)x + (size - 1)) & (~(size - 1)))
int main(int argc, char **argv)
{
    int            n, nelem, deviceCount;
    int            idev = 0; // use default device 0
    char          *device = NULL;
    unsigned int   flags;
    size_t         bytes;
    float         *a, *b, *c;          // Pinned memory allocated on the CPU
    float         *a_UA, *b_UA, *c_UA; // Non-4K Aligned Pinned memory on the CPU
    float         *d_a, *d_b, *d_c;    // Device pointers for mapped memory
    float          errorNorm, refNorm, ref, diff;
    cudaDeviceProp deviceProp;
if (checkCmdLineFlag(argc, (const char **)argv, "help")) { if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
printf("Usage: simpleZeroCopy [OPTION]\n\n"); printf("Usage: simpleZeroCopy [OPTION]\n\n");
printf("Options:\n"); printf("Options:\n");
printf(" --device=[device #] Specify the device to be used\n"); printf(" --device=[device #] Specify the device to be used\n");
printf( printf(" --use_generic_memory (optional) use generic page-aligned for system "
" --use_generic_memory (optional) use generic page-aligned for system " "memory\n");
"memory\n"); return EXIT_SUCCESS;
return EXIT_SUCCESS;
}
/* Get the device selected by the user or default to 0, and then set it. */
if (getCmdLineArgumentString(argc, (const char **)argv, "device", &device)) {
cudaGetDeviceCount(&deviceCount);
idev = atoi(device);
if (idev >= deviceCount || idev < 0) {
fprintf(stderr,
"Device number %d is invalid, will use default CUDA device 0.\n",
idev);
idev = 0;
} }
}
// if GPU found supports SM 1.2, then continue, otherwise we exit /* Get the device selected by the user or default to 0, and then set it. */
if (!checkCudaCapabilities(1, 2)) { if (getCmdLineArgumentString(argc, (const char **)argv, "device", &device)) {
exit(EXIT_SUCCESS); cudaGetDeviceCount(&deviceCount);
} idev = atoi(device);
if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) { if (idev >= deviceCount || idev < 0) {
fprintf(stderr, "Device number %d is invalid, will use default CUDA device 0.\n", idev);
idev = 0;
}
}
// if GPU found supports SM 1.2, then continue, otherwise we exit
if (!checkCudaCapabilities(1, 2)) {
exit(EXIT_SUCCESS);
}
if (checkCmdLineFlag(argc, (const char **)argv, "use_generic_memory")) {
#if defined(__APPLE__) || defined(MACOSX) #if defined(__APPLE__) || defined(MACOSX)
bPinGenericMemory = false; // Generic Pinning of System Paged memory is not bPinGenericMemory = false; // Generic Pinning of System Paged memory is not
// currently supported on Mac OSX // currently supported on Mac OSX
#else #else
bPinGenericMemory = true; bPinGenericMemory = true;
#endif #endif
} }
if (bPinGenericMemory) { if (bPinGenericMemory) {
printf("> Using Generic System Paged Memory (malloc)\n"); printf("> Using Generic System Paged Memory (malloc)\n");
} else { }
printf("> Using CUDA Host Allocated (cudaHostAlloc)\n"); else {
} printf("> Using CUDA Host Allocated (cudaHostAlloc)\n");
}
checkCudaErrors(cudaSetDevice(idev)); checkCudaErrors(cudaSetDevice(idev));
/* Verify the selected device supports mapped memory and set the device /* Verify the selected device supports mapped memory and set the device
flags for mapping host memory. */ flags for mapping host memory. */
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, idev)); checkCudaErrors(cudaGetDeviceProperties(&deviceProp, idev));
#if CUDART_VERSION >= 2020

    if (!deviceProp.canMapHostMemory) {
        fprintf(stderr, "Device %d does not support mapping CPU host memory!\n", idev);
        exit(EXIT_SUCCESS);
    }

    checkCudaErrors(cudaSetDeviceFlags(cudaDeviceMapHost));
#else
    fprintf(stderr,
            "CUDART version %d.%d does not support "
            "<cudaDeviceProp.canMapHostMemory> field\n",
            CUDART_VERSION / 1000,
            (CUDART_VERSION % 100) / 10);
    exit(EXIT_SUCCESS);
#endif
#if CUDART_VERSION < 4000 #if CUDART_VERSION < 4000
if (bPinGenericMemory) { if (bPinGenericMemory) {
fprintf( fprintf(stderr,
stderr, "CUDART version %d.%d does not support <cudaHostRegister> function\n",
"CUDART version %d.%d does not support <cudaHostRegister> function\n", CUDART_VERSION / 1000,
CUDART_VERSION / 1000, (CUDART_VERSION % 100) / 10); (CUDART_VERSION % 100) / 10);
exit(EXIT_SUCCESS); exit(EXIT_SUCCESS);
} }
#endif #endif
/* Allocate mapped CPU memory. */ /* Allocate mapped CPU memory. */
nelem = 1048576; nelem = 1048576;
bytes = nelem * sizeof(float); bytes = nelem * sizeof(float);
if (bPinGenericMemory) { if (bPinGenericMemory) {
#if CUDART_VERSION >= 4000 #if CUDART_VERSION >= 4000
a_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); a_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
b_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); b_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
c_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT); c_UA = (float *)malloc(bytes + MEMORY_ALIGNMENT);
// We need to ensure memory is aligned to 4K (so we will need to pad memory
// accordingly)
a = (float *)ALIGN_UP(a_UA, MEMORY_ALIGNMENT); a = (float *)ALIGN_UP(a_UA, MEMORY_ALIGNMENT);
b = (float *)ALIGN_UP(b_UA, MEMORY_ALIGNMENT); b = (float *)ALIGN_UP(b_UA, MEMORY_ALIGNMENT);
c = (float *)ALIGN_UP(c_UA, MEMORY_ALIGNMENT); c = (float *)ALIGN_UP(c_UA, MEMORY_ALIGNMENT);
checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped)); checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped));
checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped)); checkCudaErrors(cudaHostRegister(b, bytes, cudaHostRegisterMapped));
checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped)); checkCudaErrors(cudaHostRegister(c, bytes, cudaHostRegisterMapped));
#endif #endif
} else { }
else {
#if CUDART_VERSION >= 2020 #if CUDART_VERSION >= 2020
flags = cudaHostAllocMapped; flags = cudaHostAllocMapped;
checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags)); checkCudaErrors(cudaHostAlloc((void **)&a, bytes, flags));
checkCudaErrors(cudaHostAlloc((void **)&b, bytes, flags)); checkCudaErrors(cudaHostAlloc((void **)&b, bytes, flags));
checkCudaErrors(cudaHostAlloc((void **)&c, bytes, flags)); checkCudaErrors(cudaHostAlloc((void **)&c, bytes, flags));
#endif #endif
} }
/* Initialize the vectors. */ /* Initialize the vectors. */
for (n = 0; n < nelem; n++) { for (n = 0; n < nelem; n++) {
a[n] = rand() / (float)RAND_MAX; a[n] = rand() / (float)RAND_MAX;
b[n] = rand() / (float)RAND_MAX; b[n] = rand() / (float)RAND_MAX;
} }
/* Get the device pointers for the pinned CPU memory mapped into the GPU /* Get the device pointers for the pinned CPU memory mapped into the GPU
memory space. */ memory space. */
#if CUDART_VERSION >= 2020 #if CUDART_VERSION >= 2020
checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0)); checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0));
checkCudaErrors(cudaHostGetDevicePointer((void **)&d_b, (void *)b, 0)); checkCudaErrors(cudaHostGetDevicePointer((void **)&d_b, (void *)b, 0));
checkCudaErrors(cudaHostGetDevicePointer((void **)&d_c, (void *)c, 0)); checkCudaErrors(cudaHostGetDevicePointer((void **)&d_c, (void *)c, 0));
#endif #endif
/* Call the GPU kernel using the CPU pointers residing in CPU mapped memory. /* Call the GPU kernel using the CPU pointers residing in CPU mapped memory.
*/ */
printf("> vectorAddGPU kernel will add vectors using mapped CPU memory...\n"); printf("> vectorAddGPU kernel will add vectors using mapped CPU memory...\n");
dim3 block(256); dim3 block(256);
dim3 grid((unsigned int)ceil(nelem / (float)block.x)); dim3 grid((unsigned int)ceil(nelem / (float)block.x));
vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem); vectorAddGPU<<<grid, block>>>(d_a, d_b, d_c, nelem);
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
getLastCudaError("vectorAddGPU() execution failed"); getLastCudaError("vectorAddGPU() execution failed");
/* Compare the results */ /* Compare the results */
printf("> Checking the results from vectorAddGPU() ...\n"); printf("> Checking the results from vectorAddGPU() ...\n");
errorNorm = 0.f; errorNorm = 0.f;
refNorm = 0.f; refNorm = 0.f;
for (n = 0; n < nelem; n++) { for (n = 0; n < nelem; n++) {
ref = a[n] + b[n]; ref = a[n] + b[n];
diff = c[n] - ref; diff = c[n] - ref;
errorNorm += diff * diff; errorNorm += diff * diff;
refNorm += ref * ref; refNorm += ref * ref;
} }
errorNorm = (float)sqrt((double)errorNorm); errorNorm = (float)sqrt((double)errorNorm);
refNorm = (float)sqrt((double)refNorm); refNorm = (float)sqrt((double)refNorm);
/* Memory clean up */ /* Memory clean up */
printf("> Releasing CPU memory...\n"); printf("> Releasing CPU memory...\n");
if (bPinGenericMemory) { if (bPinGenericMemory) {
#if CUDART_VERSION >= 4000 #if CUDART_VERSION >= 4000
checkCudaErrors(cudaHostUnregister(a)); checkCudaErrors(cudaHostUnregister(a));
checkCudaErrors(cudaHostUnregister(b)); checkCudaErrors(cudaHostUnregister(b));
checkCudaErrors(cudaHostUnregister(c)); checkCudaErrors(cudaHostUnregister(c));
free(a_UA); free(a_UA);
free(b_UA); free(b_UA);
free(c_UA); free(c_UA);
#endif #endif
} else { }
else {
#if CUDART_VERSION >= 2020 #if CUDART_VERSION >= 2020
checkCudaErrors(cudaFreeHost(a)); checkCudaErrors(cudaFreeHost(a));
checkCudaErrors(cudaFreeHost(b)); checkCudaErrors(cudaFreeHost(b));
checkCudaErrors(cudaFreeHost(c)); checkCudaErrors(cudaFreeHost(c));
#endif #endif
} }
exit(errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE); exit(errorNorm / refNorm < 1.e-6f ? EXIT_SUCCESS : EXIT_FAILURE);
} }
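Condensed, the generic-memory path exercised above (malloc, align to 4 KiB, pin with cudaHostRegister, map into the device address space) looks roughly like the following sketch; nelem is assumed to be defined as in the sample:

    size_t bytes = nelem * sizeof(float);
    float *raw   = (float *)malloc(bytes + MEMORY_ALIGNMENT);
    float *a     = (float *)ALIGN_UP(raw, MEMORY_ALIGNMENT); // round up to a 4096-byte boundary

    checkCudaErrors(cudaHostRegister(a, bytes, cudaHostRegisterMapped));

    float *d_a = NULL;
    checkCudaErrors(cudaHostGetDevicePointer((void **)&d_a, (void *)a, 0)); // same memory, device-visible

    // ... launch kernels that read and write d_a ...

    checkCudaErrors(cudaHostUnregister(a));
    free(raw);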

View File

@ -34,4 +34,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed. Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details) ## References (for more details)

View File

@ -29,113 +29,111 @@
* memory. * memory.
*/ */
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <math.h>
#include <stdint.h>

#include <cstdio>
#include <ctime>
#define min(a, b) (a) < (b) ? (a) : (b) #define min(a, b) (a) < (b) ? (a) : (b)
#define max(a, b) (a) > (b) ? (a) : (b) #define max(a, b) (a) > (b) ? (a) : (b)
#define LOOP_NUM 50 #define LOOP_NUM 50
__global__ void atomicKernel(int *atom_arr)
{
    unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = 0; i < LOOP_NUM; i++) { for (int i = 0; i < LOOP_NUM; i++) {
// Atomic addition // Atomic addition
atomicAdd_system(&atom_arr[0], 10); atomicAdd_system(&atom_arr[0], 10);
// Atomic exchange // Atomic exchange
atomicExch_system(&atom_arr[1], tid); atomicExch_system(&atom_arr[1], tid);
// Atomic maximum // Atomic maximum
atomicMax_system(&atom_arr[2], tid); atomicMax_system(&atom_arr[2], tid);
// Atomic minimum // Atomic minimum
atomicMin_system(&atom_arr[3], tid); atomicMin_system(&atom_arr[3], tid);
// Atomic increment (modulo 17+1) // Atomic increment (modulo 17+1)
atomicInc_system((unsigned int *)&atom_arr[4], 17); atomicInc_system((unsigned int *)&atom_arr[4], 17);
// Atomic decrement // Atomic decrement
atomicDec_system((unsigned int *)&atom_arr[5], 137); atomicDec_system((unsigned int *)&atom_arr[5], 137);
// Atomic compare-and-swap // Atomic compare-and-swap
atomicCAS_system(&atom_arr[6], tid - 1, tid); atomicCAS_system(&atom_arr[6], tid - 1, tid);
// Bitwise atomic instructions // Bitwise atomic instructions
// Atomic AND // Atomic AND
atomicAnd_system(&atom_arr[7], 2 * tid + 7); atomicAnd_system(&atom_arr[7], 2 * tid + 7);
// Atomic OR // Atomic OR
atomicOr_system(&atom_arr[8], 1 << tid); atomicOr_system(&atom_arr[8], 1 << tid);
// Atomic XOR // Atomic XOR
atomicXor_system(&atom_arr[9], tid); atomicXor_system(&atom_arr[9], tid);
} }
} }
void atomicKernel_CPU(int *atom_arr, int no_of_threads)
{
    for (int i = no_of_threads; i < 2 * no_of_threads; i++) {
        for (int j = 0; j < LOOP_NUM; j++) {
            // Atomic addition
            __sync_fetch_and_add(&atom_arr[0], 10);
// Atomic exchange // Atomic exchange
__sync_lock_test_and_set(&atom_arr[1], i); __sync_lock_test_and_set(&atom_arr[1], i);
// Atomic maximum // Atomic maximum
int old, expected; int old, expected;
do { do {
expected = atom_arr[2]; expected = atom_arr[2];
old = __sync_val_compare_and_swap(&atom_arr[2], expected, old = __sync_val_compare_and_swap(&atom_arr[2], expected, max(expected, i));
max(expected, i)); } while (old != expected);
} while (old != expected);
// Atomic minimum // Atomic minimum
do { do {
expected = atom_arr[3]; expected = atom_arr[3];
old = __sync_val_compare_and_swap(&atom_arr[3], expected, old = __sync_val_compare_and_swap(&atom_arr[3], expected, min(expected, i));
min(expected, i)); } while (old != expected);
} while (old != expected);
// Atomic increment (modulo 17+1) // Atomic increment (modulo 17+1)
int limit = 17; int limit = 17;
do { do {
expected = atom_arr[4]; expected = atom_arr[4];
old = __sync_val_compare_and_swap( old = __sync_val_compare_and_swap(&atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1);
&atom_arr[4], expected, (expected >= limit) ? 0 : expected + 1); } while (old != expected);
} while (old != expected);
// Atomic decrement // Atomic decrement
limit = 137; limit = 137;
do { do {
expected = atom_arr[5]; expected = atom_arr[5];
old = __sync_val_compare_and_swap( old = __sync_val_compare_and_swap(
&atom_arr[5], expected, &atom_arr[5], expected, ((expected == 0) || (expected > limit)) ? limit : expected - 1);
((expected == 0) || (expected > limit)) ? limit : expected - 1); } while (old != expected);
} while (old != expected);
// Atomic compare-and-swap // Atomic compare-and-swap
__sync_val_compare_and_swap(&atom_arr[6], i - 1, i); __sync_val_compare_and_swap(&atom_arr[6], i - 1, i);
// Bitwise atomic instructions // Bitwise atomic instructions
// Atomic AND // Atomic AND
__sync_fetch_and_and(&atom_arr[7], 2 * i + 7); __sync_fetch_and_and(&atom_arr[7], 2 * i + 7);
// Atomic OR // Atomic OR
__sync_fetch_and_or(&atom_arr[8], 1 << i); __sync_fetch_and_or(&atom_arr[8], 1 << i);
// Atomic XOR // Atomic XOR
// 10th element should be 0xff // 10th element should be 0xff
__sync_fetch_and_xor(&atom_arr[9], i); __sync_fetch_and_xor(&atom_arr[9], i);
}
} }
}
} }
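For orientation (values taken from main() further down, where numThreads = 256 and numBlocks = 64): the GPU kernel and this CPU loop split one logical index space, which is why verify() is later called with twice the GPU thread count.
// GPU kernel tids  : [0, 64 * 256)      = [0, 16384)
// CPU loop indices : [16384, 2 * 16384) = [16384, 32768)
// verify() length  : 2 * numThreads * numBlocks = 32768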
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -145,198 +143,201 @@ void atomicKernel_CPU(int *atom_arr, int no_of_threads) {
//! @param idata input data as provided to device //! @param idata input data as provided to device
//! @param len number of elements in reference / idata //! @param len number of elements in reference / idata
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
int verify(int *testData, const int len) { int verify(int *testData, const int len)
int val = 0; {
int val = 0;
for (int i = 0; i < len * LOOP_NUM; ++i) { for (int i = 0; i < len * LOOP_NUM; ++i) {
val += 10; val += 10;
}
if (val != testData[0]) {
printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]);
return false;
}
val = 0;
bool found = false;
for (int i = 0; i < len; ++i) {
// second element should be a member of [0, len)
if (i == testData[1]) {
found = true;
break;
} }
}
if (!found) { if (val != testData[0]) {
printf("atomicExch failed\n"); printf("atomicAdd failed val = %d testData = %d\n", val, testData[0]);
return false; return false;
}
val = -(1 << 8);
for (int i = 0; i < len; ++i) {
// third element should be len-1
val = max(val, i);
}
if (val != testData[2]) {
printf("atomicMax failed\n");
return false;
}
val = 1 << 8;
for (int i = 0; i < len; ++i) {
val = min(val, i);
}
if (val != testData[3]) {
printf("atomicMin failed\n");
return false;
}
int limit = 17;
val = 0;
for (int i = 0; i < len * LOOP_NUM; ++i) {
val = (val >= limit) ? 0 : val + 1;
}
if (val != testData[4]) {
printf("atomicInc failed\n");
return false;
}
limit = 137;
val = 0;
for (int i = 0; i < len * LOOP_NUM; ++i) {
val = ((val == 0) || (val > limit)) ? limit : val - 1;
}
if (val != testData[5]) {
printf("atomicDec failed\n");
return false;
}
found = false;
for (int i = 0; i < len; ++i) {
// seventh element should be a member of [0, len)
if (i == testData[6]) {
found = true;
break;
} }
}
if (!found) { val = 0;
printf("atomicCAS failed\n");
return false;
}
val = 0xff; bool found = false;
for (int i = 0; i < len; ++i) { for (int i = 0; i < len; ++i) {
// 8th element should be 1 // second element should be a member of [0, len)
val &= (2 * i + 7); if (i == testData[1]) {
} found = true;
break;
}
}
if (val != testData[7]) { if (!found) {
printf("atomicAnd failed\n"); printf("atomicExch failed\n");
return false; return false;
} }
val = 0; val = -(1 << 8);
for (int i = 0; i < len; ++i) { for (int i = 0; i < len; ++i) {
// 9th element should be 0xff // third element should be len-1
val |= (1 << i); val = max(val, i);
} }
if (val != testData[8]) { if (val != testData[2]) {
printf("atomicOr failed\n"); printf("atomicMax failed\n");
return false; return false;
} }
val = 0xff; val = 1 << 8;
for (int i = 0; i < len; ++i) { for (int i = 0; i < len; ++i) {
// 10th element should be 0xff val = min(val, i);
val ^= i; }
}
if (val != testData[9]) { if (val != testData[3]) {
printf("atomicXor failed\n"); printf("atomicMin failed\n");
return false; return false;
} }
return true; int limit = 17;
val = 0;
for (int i = 0; i < len * LOOP_NUM; ++i) {
val = (val >= limit) ? 0 : val + 1;
}
if (val != testData[4]) {
printf("atomicInc failed\n");
return false;
}
limit = 137;
val = 0;
for (int i = 0; i < len * LOOP_NUM; ++i) {
val = ((val == 0) || (val > limit)) ? limit : val - 1;
}
if (val != testData[5]) {
printf("atomicDec failed\n");
return false;
}
found = false;
for (int i = 0; i < len; ++i) {
// seventh element should be a member of [0, len)
if (i == testData[6]) {
found = true;
break;
}
}
if (!found) {
printf("atomicCAS failed\n");
return false;
}
val = 0xff;
for (int i = 0; i < len; ++i) {
// 8th element should be 1
val &= (2 * i + 7);
}
if (val != testData[7]) {
printf("atomicAnd failed\n");
return false;
}
val = 0;
for (int i = 0; i < len; ++i) {
// 9th element should be 0xff
val |= (1 << i);
}
if (val != testData[8]) {
printf("atomicOr failed\n");
return false;
}
val = 0xff;
for (int i = 0; i < len; ++i) {
// 10th element should be 0xff
val ^= i;
}
if (val != testData[9]) {
printf("atomicXor failed\n");
return false;
}
return true;
} }
int main(int argc, char **argv) { int main(int argc, char **argv)
// set device {
cudaDeviceProp device_prop; // set device
int dev_id = findCudaDevice(argc, (const char **)argv); cudaDeviceProp device_prop;
checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id)); int dev_id = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&device_prop, dev_id));
if (!device_prop.managedMemory) { if (!device_prop.managedMemory) {
// This sample requires being run on a device that supports Unified Memory // This sample requires being run on a device that supports Unified Memory
fprintf(stderr, "Unified Memory not supported on this device\n"); fprintf(stderr, "Unified Memory not supported on this device\n");
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
} }
if (device_prop.computeMode == cudaComputeModeProhibited) { if (device_prop.computeMode == cudaComputeModeProhibited) {
// This sample requires being run with a default or process exclusive mode // This sample requires being run with a default or process exclusive mode
fprintf(stderr, fprintf(stderr,
"This sample requires a device in either default or process " "This sample requires a device in either default or process "
"exclusive mode\n"); "exclusive mode\n");
exit(EXIT_WAIVED); exit(EXIT_WAIVED);
} }
if (device_prop.major < 6) { if (device_prop.major < 6) {
printf( printf("%s: requires a minimum CUDA compute 6.0 capability, waiving "
"%s: requires a minimum CUDA compute 6.0 capability, waiving " "testing.\n",
"testing.\n", argv[0]);
argv[0]); exit(EXIT_WAIVED);
exit(EXIT_WAIVED); }
}
unsigned int numThreads = 256; unsigned int numThreads = 256;
unsigned int numBlocks = 64; unsigned int numBlocks = 64;
unsigned int numData = 10; unsigned int numData = 10;
int *atom_arr; int *atom_arr;
if (device_prop.pageableMemoryAccess) { if (device_prop.pageableMemoryAccess) {
printf("CAN access pageable memory\n"); printf("CAN access pageable memory\n");
atom_arr = (int *)malloc(sizeof(int) * numData); atom_arr = (int *)malloc(sizeof(int) * numData);
} else { }
printf("CANNOT access pageable memory\n"); else {
checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData)); printf("CANNOT access pageable memory\n");
} checkCudaErrors(cudaMallocManaged(&atom_arr, sizeof(int) * numData));
}
for (unsigned int i = 0; i < numData; i++) atom_arr[i] = 0; for (unsigned int i = 0; i < numData; i++)
atom_arr[i] = 0;
// To make the AND and XOR tests generate something other than 0... // To make the AND and XOR tests generate something other than 0...
atom_arr[7] = atom_arr[9] = 0xff; atom_arr[7] = atom_arr[9] = 0xff;
atomicKernel<<<numBlocks, numThreads>>>(atom_arr); atomicKernel<<<numBlocks, numThreads>>>(atom_arr);
atomicKernel_CPU(atom_arr, numBlocks * numThreads); atomicKernel_CPU(atom_arr, numBlocks * numThreads);
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
// Compute & verify reference solution // Compute & verify reference solution
int testResult = verify(atom_arr, 2 * numThreads * numBlocks); int testResult = verify(atom_arr, 2 * numThreads * numBlocks);
if (device_prop.pageableMemoryAccess) { if (device_prop.pageableMemoryAccess) {
free(atom_arr); free(atom_arr);
} else { }
cudaFree(atom_arr); else {
} cudaFree(atom_arr);
}
printf("systemWideAtomics completed, returned %s \n", printf("systemWideAtomics completed, returned %s \n", testResult ? "OK" : "ERROR!");
testResult ? "OK" : "ERROR!"); exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
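As an aside, the allocation branch above keys off device_prop.pageableMemoryAccess; the same capability can also be queried as a device attribute. A small sketch (dev_id as in main()):
int pageableAccess = 0;
checkCudaErrors(cudaDeviceGetAttribute(&pageableAccess, cudaDevAttrPageableMemoryAccess, dev_id));
// Non-zero: plain malloc() memory is GPU-accessible, so cudaMallocManaged can be skipped for atom_arr.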
@ -31,10 +31,10 @@
*/ */
// includes, system // includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h> #include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes CUDA // includes CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
@ -47,34 +47,34 @@
// declaration, forward // declaration, forward
void runTest(int argc, char **argv); void runTest(int argc, char **argv);
extern "C" void computeGold(float *reference, float *idata, extern "C" void computeGold(float *reference, float *idata, const unsigned int len);
const unsigned int len);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Simple test kernel for device functionality //! Simple test kernel for device functionality
//! @param g_idata input data in global memory //! @param g_idata input data in global memory
//! @param g_odata output data in global memory //! @param g_odata output data in global memory
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
__global__ void testKernel(float *g_idata, float *g_odata) { __global__ void testKernel(float *g_idata, float *g_odata)
// shared memory {
// the size is determined by the host application // shared memory
extern __shared__ float sdata[]; // the size is determined by the host application
extern __shared__ float sdata[];
// access thread id // access thread id
const unsigned int tid = threadIdx.x; const unsigned int tid = threadIdx.x;
// access number of threads in this block // access number of threads in this block
const unsigned int num_threads = blockDim.x; const unsigned int num_threads = blockDim.x;
// read in input data from global memory // read in input data from global memory
sdata[tid] = g_idata[tid]; sdata[tid] = g_idata[tid];
__syncthreads(); __syncthreads();
// perform some computations // perform some computations
sdata[tid] = (float)num_threads * sdata[tid]; sdata[tid] = (float)num_threads * sdata[tid];
__syncthreads(); __syncthreads();
// write data to global memory // write data to global memory
g_odata[tid] = sdata[tid]; g_odata[tid] = sdata[tid];
} }
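A worked instance (using the values runTest() sets below: num_threads = 32 and h_idata[i] = i): each thread scales its element by blockDim.x, so
// h_odata[i] == 32.0f * i   for i in [0, 32), e.g. h_odata[5] == 160.0f,
// which matches computeGold()'s reference[i] = idata[i] * f_len.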
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -85,81 +85,81 @@ int main(int argc, char **argv) { runTest(argc, argv); }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA //! Run a simple test for CUDA
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv) { void runTest(int argc, char **argv)
bool bTestResult = true; {
bool bTestResult = true;
printf("%s Starting...\n\n", argv[0]); printf("%s Starting...\n\n", argv[0]);
// use command-line specified CUDA device, otherwise use device with highest // use command-line specified CUDA device, otherwise use device with highest
// Gflops/s // Gflops/s
int devID = findCudaDevice(argc, (const char **)argv); int devID = findCudaDevice(argc, (const char **)argv);
StopWatchInterface *timer = 0; StopWatchInterface *timer = 0;
sdkCreateTimer(&timer); sdkCreateTimer(&timer);
sdkStartTimer(&timer); sdkStartTimer(&timer);
unsigned int num_threads = 32; unsigned int num_threads = 32;
unsigned int mem_size = sizeof(float) * num_threads; unsigned int mem_size = sizeof(float) * num_threads;
// allocate host memory // allocate host memory
float *h_idata = (float *)malloc(mem_size); float *h_idata = (float *)malloc(mem_size);
// initialize the memory // initialize the memory
for (unsigned int i = 0; i < num_threads; ++i) { for (unsigned int i = 0; i < num_threads; ++i) {
h_idata[i] = (float)i; h_idata[i] = (float)i;
} }
// allocate device memory // allocate device memory
float *d_idata; float *d_idata;
checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size)); checkCudaErrors(cudaMalloc((void **)&d_idata, mem_size));
// copy host memory to device // copy host memory to device
checkCudaErrors( checkCudaErrors(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));
// allocate device memory for result // allocate device memory for result
float *d_odata; float *d_odata;
checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size)); checkCudaErrors(cudaMalloc((void **)&d_odata, mem_size));
// setup execution parameters // setup execution parameters
dim3 grid(1, 1, 1); dim3 grid(1, 1, 1);
dim3 threads(num_threads, 1, 1); dim3 threads(num_threads, 1, 1);
// execute the kernel // execute the kernel
testKernel<<<grid, threads, mem_size>>>(d_idata, d_odata); testKernel<<<grid, threads, mem_size>>>(d_idata, d_odata);
// check if kernel execution generated an error // check if kernel execution generated an error
getLastCudaError("Kernel execution failed"); getLastCudaError("Kernel execution failed");
// allocate mem for the result on host side // allocate mem for the result on host side
float *h_odata = (float *)malloc(mem_size); float *h_odata = (float *)malloc(mem_size);
// copy result from device to host // copy result from device to host
checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads, checkCudaErrors(cudaMemcpy(h_odata, d_odata, sizeof(float) * num_threads, cudaMemcpyDeviceToHost));
cudaMemcpyDeviceToHost));
sdkStopTimer(&timer); sdkStopTimer(&timer);
printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
sdkDeleteTimer(&timer); sdkDeleteTimer(&timer);
// compute reference solution // compute reference solution
float *reference = (float *)malloc(mem_size); float *reference = (float *)malloc(mem_size);
computeGold(reference, h_idata, num_threads); computeGold(reference, h_idata, num_threads);
// check result // check result
if (checkCmdLineFlag(argc, (const char **)argv, "regression")) { if (checkCmdLineFlag(argc, (const char **)argv, "regression")) {
// write file for regression test // write file for regression test
sdkWriteFile("./data/regression.dat", h_odata, num_threads, 0.0f, false); sdkWriteFile("./data/regression.dat", h_odata, num_threads, 0.0f, false);
} else { }
// custom output handling when no regression test running else {
// in this case check if the result is equivalent to the expected solution // custom output handling when no regression test running
bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f); // in this case check if the result is equivalent to the expected solution
} bTestResult = compareData(reference, h_odata, num_threads, 0.0f, 0.0f);
}
// cleanup memory // cleanup memory
free(h_idata); free(h_idata);
free(h_odata); free(h_odata);
free(reference); free(reference);
checkCudaErrors(cudaFree(d_idata)); checkCudaErrors(cudaFree(d_idata));
checkCudaErrors(cudaFree(d_odata)); checkCudaErrors(cudaFree(d_odata));
exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE); exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
} }
@ -26,8 +26,7 @@
*/ */
// export C interface // export C interface
extern "C" void computeGold(float *reference, float *idata, extern "C" void computeGold(float *reference, float *idata, const unsigned int len);
const unsigned int len);
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
//! Compute reference data set //! Compute reference data set
@ -36,10 +35,11 @@ extern "C" void computeGold(float *reference, float *idata,
//! @param idata input data as provided to device //! @param idata input data as provided to device
//! @param len number of elements in reference / idata //! @param len number of elements in reference / idata
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
void computeGold(float *reference, float *idata, const unsigned int len) { void computeGold(float *reference, float *idata, const unsigned int len)
const float f_len = static_cast<float>(len); {
const float f_len = static_cast<float>(len);
for (unsigned int i = 0; i < len; ++i) { for (unsigned int i = 0; i < len; ++i) {
reference[i] = idata[i] * f_len; reference[i] = idata[i] * f_len;
} }
} }
@ -37,7 +37,6 @@
// For the CUDA runtime routines (prefixed with "cuda_") // For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <helper_cuda.h> #include <helper_cuda.h>
/** /**
* CUDA Kernel Device code * CUDA Kernel Device code
@ -45,166 +44,153 @@
* Computes the vector addition of A and B into C. The 3 vectors have the same * Computes the vector addition of A and B into C. The 3 vectors have the same
* number of elements numElements. * number of elements numElements.
*/ */
__global__ void vectorAdd(const float *A, const float *B, float *C, __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
int numElements) { {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements) { if (i < numElements) {
C[i] = A[i] + B[i] + 0.0f; C[i] = A[i] + B[i] + 0.0f;
} }
} }
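The bounds check above pairs with the rounded-up launch configuration in main(); with the sample's own numbers:
// blocksPerGrid    = (50000 + 256 - 1) / 256 = 196
// threads launched = 196 * 256 = 50176, i.e. 176 more than numElements,
// so the (i < numElements) guard keeps the surplus threads from writing past the end of C.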
/** /**
* Host main routine * Host main routine
*/ */
int main(void) { int main(void)
// Error code to check return values for CUDA calls {
cudaError_t err = cudaSuccess; // Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;
// Print the vector length to be used, and compute its size // Print the vector length to be used, and compute its size
int numElements = 50000; int numElements = 50000;
size_t size = numElements * sizeof(float); size_t size = numElements * sizeof(float);
printf("[Vector addition of %d elements]\n", numElements); printf("[Vector addition of %d elements]\n", numElements);
// Allocate the host input vector A // Allocate the host input vector A
float *h_A = (float *)malloc(size); float *h_A = (float *)malloc(size);
// Allocate the host input vector B // Allocate the host input vector B
float *h_B = (float *)malloc(size); float *h_B = (float *)malloc(size);
// Allocate the host output vector C // Allocate the host output vector C
float *h_C = (float *)malloc(size); float *h_C = (float *)malloc(size);
// Verify that allocations succeeded // Verify that allocations succeeded
if (h_A == NULL || h_B == NULL || h_C == NULL) { if (h_A == NULL || h_B == NULL || h_C == NULL) {
fprintf(stderr, "Failed to allocate host vectors!\n"); fprintf(stderr, "Failed to allocate host vectors!\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
}
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i) {
h_A[i] = rand() / (float)RAND_MAX;
h_B[i] = rand() / (float)RAND_MAX;
}
// Allocate the device input vector A
float *d_A = NULL;
err = cudaMalloc((void **)&d_A, size);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Allocate the device input vector B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Allocate the device output vector C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the host input vectors A and B in host memory to the device input
// vectors in
// device memory
printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr,
"Failed to copy vector A from host to device (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr,
"Failed to copy vector B from host to device (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid,
threadsPerBlock);
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the device result vector in device memory to the host result vector
// in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr,
"Failed to copy vector C from device to host (error code %s)!\n",
cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i) {
if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
} }
}
printf("Test PASSED\n"); // Initialize the host input vectors
for (int i = 0; i < numElements; ++i) {
h_A[i] = rand() / (float)RAND_MAX;
h_B[i] = rand() / (float)RAND_MAX;
}
// Free device global memory // Allocate the device input vector A
err = cudaFree(d_A); float *d_A = NULL;
err = cudaMalloc((void **)&d_A, size);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector A (error code %s)!\n", fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err));
cudaGetErrorString(err)); exit(EXIT_FAILURE);
exit(EXIT_FAILURE); }
}
err = cudaFree(d_B); // Allocate the device input vector B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector B (error code %s)!\n", fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err));
cudaGetErrorString(err)); exit(EXIT_FAILURE);
exit(EXIT_FAILURE); }
}
err = cudaFree(d_C); // Allocate the device output vector C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);
if (err != cudaSuccess) { if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector C (error code %s)!\n", fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err));
cudaGetErrorString(err)); exit(EXIT_FAILURE);
exit(EXIT_FAILURE); }
}
// Free host memory // Copy the host input vectors A and B in host memory to the device input
free(h_A); // vectors in
free(h_B); // device memory
free(h_C); printf("Copy input data from the host memory to the CUDA device\n");
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
printf("Done\n"); if (err != cudaSuccess) {
return 0; fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
err = cudaGetLastError();
if (err != cudaSuccess) {
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Copy the device result vector in device memory to the host result vector
// in host memory.
printf("Copy output data from the CUDA device to the host memory\n");
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i) {
if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
err = cudaFree(d_A);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_B);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
err = cudaFree(d_C);
if (err != cudaSuccess) {
fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
// Free host memory
free(h_A);
free(h_B);
free(h_C);
printf("Done\n");
return 0;
} }
@ -34,11 +34,11 @@
*/ */
// Includes // Includes
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring> #include <cstring>
#include <cuda.h> #include <cuda.h>
#include <iostream>
#include <stdio.h>
#include <string.h>
// includes, project // includes, project
#include <helper_cuda_drvapi.h> #include <helper_cuda_drvapi.h>
@ -50,19 +50,19 @@
using namespace std; using namespace std;
// Variables // Variables
CUdevice cuDevice; CUdevice cuDevice;
CUcontext cuContext; CUcontext cuContext;
CUmodule cuModule; CUmodule cuModule;
CUfunction vecAdd_kernel; CUfunction vecAdd_kernel;
float *h_A; float *h_A;
float *h_B; float *h_B;
float *h_C; float *h_C;
CUdeviceptr d_A; CUdeviceptr d_A;
CUdeviceptr d_B; CUdeviceptr d_B;
CUdeviceptr d_C; CUdeviceptr d_C;
// Functions // Functions
int CleanupNoFailure(); int CleanupNoFailure();
void RandomInit(float *, int); void RandomInit(float *, int);
bool findModulePath(const char *, string &, char **, string &); bool findModulePath(const char *, string &, char **, string &);
@ -72,150 +72,152 @@ bool findModulePath(const char *, string &, char **, string &);
#endif #endif
// Host code // Host code
int main(int argc, char **argv) { int main(int argc, char **argv)
printf("Vector Addition (Driver API)\n"); {
int N = 50000, devID = 0; printf("Vector Addition (Driver API)\n");
size_t size = N * sizeof(float); int N = 50000, devID = 0;
size_t size = N * sizeof(float);
// Initialize // Initialize
checkCudaErrors(cuInit(0)); checkCudaErrors(cuInit(0));
cuDevice = findCudaDeviceDRV(argc, (const char **)argv); cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// Create context // Create context
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
// first search for the module path before we load the results // first search for the module path before we load the results
string module_path; string module_path;
std::ostringstream fatbin; std::ostringstream fatbin;
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) { if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} else { }
printf("> initCUDA loading module: <%s>\n", module_path.c_str()); else {
} printf("> initCUDA loading module: <%s>\n", module_path.c_str());
}
if (!fatbin.str().size()) { if (!fatbin.str().size()) {
printf("fatbin file empty. exiting..\n"); printf("fatbin file empty. exiting..\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
// Create module from binary file (FATBIN) // Create module from binary file (FATBIN)
checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str())); checkCudaErrors(cuModuleLoadData(&cuModule, fatbin.str().c_str()));
// Get function handle from module // Get function handle from module
checkCudaErrors( checkCudaErrors(cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));
cuModuleGetFunction(&vecAdd_kernel, cuModule, "VecAdd_kernel"));
// Allocate input vectors h_A and h_B in host memory // Allocate input vectors h_A and h_B in host memory
h_A = (float *)malloc(size); h_A = (float *)malloc(size);
h_B = (float *)malloc(size); h_B = (float *)malloc(size);
h_C = (float *)malloc(size); h_C = (float *)malloc(size);
// Initialize input vectors // Initialize input vectors
RandomInit(h_A, N); RandomInit(h_A, N);
RandomInit(h_B, N); RandomInit(h_B, N);
// Allocate vectors in device memory // Allocate vectors in device memory
checkCudaErrors(cuMemAlloc(&d_A, size)); checkCudaErrors(cuMemAlloc(&d_A, size));
checkCudaErrors(cuMemAlloc(&d_B, size)); checkCudaErrors(cuMemAlloc(&d_B, size));
checkCudaErrors(cuMemAlloc(&d_C, size)); checkCudaErrors(cuMemAlloc(&d_C, size));
// Copy vectors from host memory to device memory // Copy vectors from host memory to device memory
checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size)); checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size));
checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size)); checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size));
if (1) { if (1) {
// This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel
// Launch (simpler method) // Launch (simpler method)
// Grid/Block configuration // Grid/Block configuration
int threadsPerBlock = 256; int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
void *args[] = {&d_A, &d_B, &d_C, &N}; void *args[] = {&d_A, &d_B, &d_C, &N};
// Launch the CUDA kernel // Launch the CUDA kernel
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));
threadsPerBlock, 1, 1, 0, NULL, args, NULL)); }
} else { else {
// This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel
// Launch (advanced method) // Launch (advanced method)
int offset = 0; int offset = 0;
void *argBuffer[16]; void *argBuffer[16];
*((CUdeviceptr *)&argBuffer[offset]) = d_A; *((CUdeviceptr *)&argBuffer[offset]) = d_A;
offset += sizeof(d_A); offset += sizeof(d_A);
*((CUdeviceptr *)&argBuffer[offset]) = d_B; *((CUdeviceptr *)&argBuffer[offset]) = d_B;
offset += sizeof(d_B); offset += sizeof(d_B);
*((CUdeviceptr *)&argBuffer[offset]) = d_C; *((CUdeviceptr *)&argBuffer[offset]) = d_C;
offset += sizeof(d_C); offset += sizeof(d_C);
*((int *)&argBuffer[offset]) = N; *((int *)&argBuffer[offset]) = N;
offset += sizeof(N); offset += sizeof(N);
// Grid/Block configuration // Grid/Block configuration
int threadsPerBlock = 256; int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
// Launch the CUDA kernel // Launch the CUDA kernel
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, checkCudaErrors(
threadsPerBlock, 1, 1, 0, NULL, NULL, cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, NULL, argBuffer));
argBuffer)); }
}
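    // Illustrative note (not part of the sample): when kernelParams is NULL, the
    // packed argument buffer is normally passed through the "extra" key/value
    // array rather than as a bare pointer, e.g.:
    //
    //   size_t argBufferSize = offset;
    //   void  *extra[]       = {CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
    //                           CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBufferSize,
    //                           CU_LAUNCH_PARAM_END};
    //   cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1,
    //                  threadsPerBlock, 1, 1, 0, NULL, NULL, extra);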
#ifdef _DEBUG #ifdef _DEBUG
checkCudaErrors(cuCtxSynchronize()); checkCudaErrors(cuCtxSynchronize());
#endif #endif
// Copy result from device memory to host memory // Copy result from device memory to host memory
// h_C contains the result in host memory // h_C contains the result in host memory
checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size)); checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size));
// Verify result // Verify result
int i; int i;
for (i = 0; i < N; ++i) { for (i = 0; i < N; ++i) {
float sum = h_A[i] + h_B[i]; float sum = h_A[i] + h_B[i];
if (fabs(h_C[i] - sum) > 1e-7f) { if (fabs(h_C[i] - sum) > 1e-7f) {
break; break;
}
} }
}
CleanupNoFailure(); CleanupNoFailure();
printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL"); printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");
exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE); exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
} }
int CleanupNoFailure() { int CleanupNoFailure()
// Free device memory {
checkCudaErrors(cuMemFree(d_A)); // Free device memory
checkCudaErrors(cuMemFree(d_B)); checkCudaErrors(cuMemFree(d_A));
checkCudaErrors(cuMemFree(d_C)); checkCudaErrors(cuMemFree(d_B));
checkCudaErrors(cuMemFree(d_C));
// Free host memory // Free host memory
if (h_A) { if (h_A) {
free(h_A); free(h_A);
} }
if (h_B) { if (h_B) {
free(h_B); free(h_B);
} }
if (h_C) { if (h_C) {
free(h_C); free(h_C);
} }
checkCudaErrors(cuCtxDestroy(cuContext)); checkCudaErrors(cuCtxDestroy(cuContext));
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }
// Allocates an array with random float entries. // Allocates an array with random float entries.
void RandomInit(float *data, int n) { void RandomInit(float *data, int n)
for (int i = 0; i < n; ++i) { {
data[i] = rand() / (float)RAND_MAX; for (int i = 0; i < n; ++i) {
} data[i] = rand() / (float)RAND_MAX;
}
} }
@ -33,9 +33,10 @@
*/ */
// Device code // Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
float *C, int N) { {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) C[i] = A[i] + B[i]; if (i < N)
C[i] = A[i] + B[i];
} }
@ -30,4 +30,3 @@ cuMemcpyDtoH, cuDeviceCanAccessPeer, cuModuleGetFunction, cuMemSetAccess, cuMemR
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details) ## References (for more details)
@ -29,172 +29,172 @@
static size_t round_up(size_t x, size_t y) { return ((x + y - 1) / y) * y; } static size_t round_up(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
CUresult simpleMallocMultiDeviceMmap( CUresult simpleMallocMultiDeviceMmap(CUdeviceptr *dptr,
CUdeviceptr *dptr, size_t *allocationSize, size_t size, size_t *allocationSize,
const std::vector<CUdevice> &residentDevices, size_t size,
const std::vector<CUdevice> &mappingDevices, size_t align) { const std::vector<CUdevice> &residentDevices,
CUresult status = CUDA_SUCCESS; const std::vector<CUdevice> &mappingDevices,
size_t min_granularity = 0; size_t align)
size_t stripeSize; {
CUresult status = CUDA_SUCCESS;
size_t min_granularity = 0;
size_t stripeSize;
// Setup the properties common for all the chunks // Setup the properties common for all the chunks
// The allocations will be device pinned memory. // The allocations will be device pinned memory.
// This property structure describes the physical location where the memory // This property structure describes the physical location where the memory
// will be allocated via cuMemCreate along with additional properties. In this // will be allocated via cuMemCreate along with additional properties. In this
// case, the allocation will be pinned device memory local to a given device. // case, the allocation will be pinned device memory local to a given device.
CUmemAllocationProp prop = {}; CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
// Get the minimum granularity needed for the resident devices // Get the minimum granularity needed for the resident devices
// (the max of the minimum granularity of each participating device) // (the max of the minimum granularity of each participating device)
for (int idx = 0; idx < residentDevices.size(); idx++) { for (int idx = 0; idx < residentDevices.size(); idx++) {
size_t granularity = 0; size_t granularity = 0;
// get the minimum granularity for residentDevices[idx] // get the minimum granularity for residentDevices[idx]
prop.location.id = residentDevices[idx]; prop.location.id = residentDevices[idx];
status = cuMemGetAllocationGranularity(&granularity, &prop, status = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
CU_MEM_ALLOC_GRANULARITY_MINIMUM); if (status != CUDA_SUCCESS) {
if (status != CUDA_SUCCESS) { goto done;
goto done; }
} if (min_granularity < granularity) {
if (min_granularity < granularity) { min_granularity = granularity;
min_granularity = granularity; }
}
}
// Get the minimum granularity needed for the accessing devices
// (the max of the minimum granularity of each participating device)
for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
size_t granularity = 0;
// get the minimum granularity for mappingDevices[idx]
prop.location.id = mappingDevices[idx];
status = cuMemGetAllocationGranularity(&granularity, &prop,
CU_MEM_ALLOC_GRANULARITY_MINIMUM);
if (status != CUDA_SUCCESS) {
goto done;
}
if (min_granularity < granularity) {
min_granularity = granularity;
}
}
// Round up the size such that we can evenly split it into a stripe size that
// meets the granularity requirements. Essentially size = N *
// residentDevices.size() * min_granularity is the requirement, since each
// piece of the allocation will be stripeSize = N * min_granularity and the
// min_granularity requirement applies to each stripeSize piece of the
// allocation.
size = round_up(size, residentDevices.size() * min_granularity);
stripeSize = size / residentDevices.size();
// Return the rounded up size to the caller for use in the free
if (allocationSize) {
*allocationSize = size;
}
// Reserve the required contiguous VA space for the allocations
status = cuMemAddressReserve(dptr, size, align, 0, 0);
if (status != CUDA_SUCCESS) {
goto done;
}
// Create and map the backings on each gpu
// note: reusing CUmemAllocationProp prop from earlier with prop.type &
// prop.location.type already specified.
for (size_t idx = 0; idx < residentDevices.size(); idx++) {
CUresult status2 = CUDA_SUCCESS;
// Set the location for this chunk to this device
prop.location.id = residentDevices[idx];
// Create the allocation as a pinned allocation on this device
CUmemGenericAllocationHandle allocationHandle;
status = cuMemCreate(&allocationHandle, stripeSize, &prop, 0);
if (status != CUDA_SUCCESS) {
goto done;
} }
// Assign the chunk to the appropriate VA range and release the handle. // Get the minimum granularity needed for the accessing devices
// After mapping the memory, it can be referenced by virtual address. // (the max of the minimum granularity of each participating device)
// Since we do not need to make any other mappings of this memory or export
// it, we no longer need and can release the allocationHandle. The
// allocation will be kept live until it is unmapped.
status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0,
allocationHandle, 0);
// the handle needs to be released even if the mapping failed.
status2 = cuMemRelease(allocationHandle);
if (status == CUDA_SUCCESS) {
// cuMemRelease should not have failed here
// as the handle was just allocated successfully
// however return an error if it does.
status = status2;
}
// Cleanup in case of any mapping failures.
if (status != CUDA_SUCCESS) {
goto done;
}
}
{
// Each accessDescriptor will describe the mapping requirement for a single
// device
std::vector<CUmemAccessDesc> accessDescriptors;
accessDescriptors.resize(mappingDevices.size());
// Prepare the access descriptor array indicating where and how the backings
// should be visible.
for (size_t idx = 0; idx < mappingDevices.size(); idx++) { for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
// Specify which device we are adding mappings for. size_t granularity = 0;
accessDescriptors[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDescriptors[idx].location.id = mappingDevices[idx];
// Specify both read and write access. // get the minimum granularity for mappingDevices[idx]
accessDescriptors[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; prop.location.id = mappingDevices[idx];
status = cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
if (status != CUDA_SUCCESS) {
goto done;
}
if (min_granularity < granularity) {
min_granularity = granularity;
}
} }
// Apply the access descriptors to the whole VA range. // Round up the size such that we can evenly split it into a stripe size that
status = cuMemSetAccess(*dptr, size, &accessDescriptors[0], // meets the granularity requirements. Essentially size = N *
accessDescriptors.size()); // residentDevices.size() * min_granularity is the requirement, since each
// piece of the allocation will be stripeSize = N * min_granularity and the
// min_granularity requirement applies to each stripeSize piece of the
// allocation.
size = round_up(size, residentDevices.size() * min_granularity);
stripeSize = size / residentDevices.size();
// Return the rounded up size to the caller for use in the free
if (allocationSize) {
*allocationSize = size;
}
// Reserve the required contiguous VA space for the allocations
status = cuMemAddressReserve(dptr, size, align, 0, 0);
if (status != CUDA_SUCCESS) { if (status != CUDA_SUCCESS) {
goto done; goto done;
}
// Create and map the backings on each gpu
// note: reusing CUmemAllocationProp prop from earlier with prop.type &
// prop.location.type already specified.
for (size_t idx = 0; idx < residentDevices.size(); idx++) {
CUresult status2 = CUDA_SUCCESS;
// Set the location for this chunk to this device
prop.location.id = residentDevices[idx];
// Create the allocation as a pinned allocation on this device
CUmemGenericAllocationHandle allocationHandle;
status = cuMemCreate(&allocationHandle, stripeSize, &prop, 0);
if (status != CUDA_SUCCESS) {
goto done;
}
// Assign the chunk to the appropriate VA range and release the handle.
// After mapping the memory, it can be referenced by virtual address.
// Since we do not need to make any other mappings of this memory or export
// it, we no longer need and can release the allocationHandle. The
// allocation will be kept live until it is unmapped.
status = cuMemMap(*dptr + (stripeSize * idx), stripeSize, 0, allocationHandle, 0);
// the handle needs to be released even if the mapping failed.
status2 = cuMemRelease(allocationHandle);
if (status == CUDA_SUCCESS) {
// cuMemRelease should not have failed here
// as the handle was just allocated successfully
// however return an error if it does.
status = status2;
}
// Cleanup in case of any mapping failures.
if (status != CUDA_SUCCESS) {
goto done;
}
}
{
// Each accessDescriptor will describe the mapping requirement for a single
// device
std::vector<CUmemAccessDesc> accessDescriptors;
accessDescriptors.resize(mappingDevices.size());
// Prepare the access descriptor array indicating where and how the backings
// should be visible.
for (size_t idx = 0; idx < mappingDevices.size(); idx++) {
// Specify which device we are adding mappings for.
accessDescriptors[idx].location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDescriptors[idx].location.id = mappingDevices[idx];
// Specify both read and write access.
accessDescriptors[idx].flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
}
// Apply the access descriptors to the whole VA range.
status = cuMemSetAccess(*dptr, size, &accessDescriptors[0], accessDescriptors.size());
if (status != CUDA_SUCCESS) {
goto done;
}
} }
}
done: done:
if (status != CUDA_SUCCESS) { if (status != CUDA_SUCCESS) {
if (*dptr) { if (*dptr) {
simpleFreeMultiDeviceMmap(*dptr, size); simpleFreeMultiDeviceMmap(*dptr, size);
}
} }
}
return status; return status;
} }
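A worked example of the striping arithmetic above (the device count and granularity here are illustrative assumptions, not values from the sample):
// 2 resident devices, 2 MiB minimum granularity, 5 MiB requested:
//   size       = round_up(5 MiB, 2 * 2 MiB) = 8 MiB   (returned via *allocationSize)
//   stripeSize = 8 MiB / 2                  = 4 MiB   (one cuMemCreate + cuMemMap per device)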
CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size) { CUresult simpleFreeMultiDeviceMmap(CUdeviceptr dptr, size_t size)
CUresult status = CUDA_SUCCESS; {
CUresult status = CUDA_SUCCESS;
// Unmap the mapped virtual memory region // Unmap the mapped virtual memory region
// Since the handles to the mapped backing stores have already been released // Since the handles to the mapped backing stores have already been released
// by cuMemRelease, and these are the only/last mappings referencing them, // by cuMemRelease, and these are the only/last mappings referencing them,
// The backing stores will be freed. // The backing stores will be freed.
// Since the memory has been unmapped after this call, accessing the specified // Since the memory has been unmapped after this call, accessing the specified
// VA range will result in a fault (until it is remapped). // VA range will result in a fault (until it is remapped).
status = cuMemUnmap(dptr, size); status = cuMemUnmap(dptr, size);
if (status != CUDA_SUCCESS) { if (status != CUDA_SUCCESS) {
return status; return status;
} }
// Free the virtual address region. This allows the virtual address region // Free the virtual address region. This allows the virtual address region
// to be reused by future cuMemAddressReserve calls. This also allows the // to be reused by future cuMemAddressReserve calls. This also allows the
// virtual address region to be used by other allocation made through // virtual address region to be used by other allocation made through
// operating system calls like malloc & mmap. // operating system calls like malloc & mmap.
status = cuMemAddressFree(dptr, size); status = cuMemAddressFree(dptr, size);
if (status != CUDA_SUCCESS) { if (status != CUDA_SUCCESS) {
return status; return status;
} }
return status; return status;
} }
@ -63,10 +63,12 @@
//! handle //! handle
//! is not needed after its mappings are set up. //! is not needed after its mappings are set up.
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
CUresult simpleMallocMultiDeviceMmap( CUresult simpleMallocMultiDeviceMmap(CUdeviceptr *dptr,
CUdeviceptr *dptr, size_t *allocationSize, size_t size, size_t *allocationSize,
const std::vector<CUdevice> &residentDevices, size_t size,
const std::vector<CUdevice> &mappingDevices, size_t align = 0); const std::vector<CUdevice> &residentDevices,
const std::vector<CUdevice> &mappingDevices,
size_t align = 0);
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
//! Frees resources allocated by simpleMallocMultiDeviceMmap //! Frees resources allocated by simpleMallocMultiDeviceMmap
@ -36,11 +36,11 @@
*/ */
// Includes // Includes
#include <cstring>
#include <cuda.h> #include <cuda.h>
#include <iostream>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <cstring>
#include <iostream>
// includes, project // includes, project
#include <helper_cuda_drvapi.h> #include <helper_cuda_drvapi.h>
@ -54,115 +54,111 @@
using namespace std; using namespace std;
// Variables // Variables
CUdevice cuDevice; CUdevice cuDevice;
CUcontext cuContext; CUcontext cuContext;
CUmodule cuModule; CUmodule cuModule;
CUfunction vecAdd_kernel; CUfunction vecAdd_kernel;
float *h_A; float *h_A;
float *h_B; float *h_B;
float *h_C; float *h_C;
CUdeviceptr d_A; CUdeviceptr d_A;
CUdeviceptr d_B; CUdeviceptr d_B;
CUdeviceptr d_C; CUdeviceptr d_C;
size_t allocationSize = 0; size_t allocationSize = 0;
// Functions // Functions
int CleanupNoFailure(); int CleanupNoFailure();
void RandomInit(float *, int); void RandomInit(float *, int);
//define input fatbin file // define input fatbin file
#ifndef FATBIN_FILE #ifndef FATBIN_FILE
#define FATBIN_FILE "vectorAdd_kernel64.fatbin" #define FATBIN_FILE "vectorAdd_kernel64.fatbin"
#endif #endif
// collect all of the devices whose memory can be mapped from cuDevice. // collect all of the devices whose memory can be mapped from cuDevice.
vector<CUdevice> getBackingDevices(CUdevice cuDevice) { vector<CUdevice> getBackingDevices(CUdevice cuDevice)
int num_devices; {
int num_devices;
checkCudaErrors(cuDeviceGetCount(&num_devices)); checkCudaErrors(cuDeviceGetCount(&num_devices));
vector<CUdevice> backingDevices; vector<CUdevice> backingDevices;
backingDevices.push_back(cuDevice); backingDevices.push_back(cuDevice);
for (int dev = 0; dev < num_devices; dev++) { for (int dev = 0; dev < num_devices; dev++) {
int capable = 0; int capable = 0;
int attributeVal = 0; int attributeVal = 0;
// The mapping device is already in the backingDevices vector // The mapping device is already in the backingDevices vector
if (dev == cuDevice) { if (dev == cuDevice) {
continue; continue;
}
// Only peer capable devices can map each others memory
checkCudaErrors(cuDeviceCanAccessPeer(&capable, cuDevice, dev));
if (!capable) {
continue;
}
// The device needs to support virtual address management for the required
// apis to work
checkCudaErrors(
cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice));
if (attributeVal == 0) {
continue;
}
backingDevices.push_back(dev);
} }
return backingDevices;
// Only peer capable devices can map each others memory
checkCudaErrors(cuDeviceCanAccessPeer(&capable, cuDevice, dev));
if (!capable) {
continue;
}
// The device needs to support virtual address management for the required
// apis to work
checkCudaErrors(cuDeviceGetAttribute(
&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
cuDevice));
if (attributeVal == 0) {
continue;
}
backingDevices.push_back(dev);
}
return backingDevices;
} }
// Host code // Host code
int main(int argc, char **argv) { int main(int argc, char **argv)
printf("Vector Addition (Driver API)\n"); {
int N = 50000; printf("Vector Addition (Driver API)\n");
size_t size = N * sizeof(float); int N = 50000;
int attributeVal = 0; size_t size = N * sizeof(float);
int attributeVal = 0;
// Initialize // Initialize
checkCudaErrors(cuInit(0)); checkCudaErrors(cuInit(0));
cuDevice = findCudaDeviceDRV(argc, (const char **)argv); cuDevice = findCudaDeviceDRV(argc, (const char **)argv);
// Check that the selected device supports virtual address management // Check that the selected device supports virtual address management
checkCudaErrors(cuDeviceGetAttribute( checkCudaErrors(
&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDeviceGetAttribute(&attributeVal, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, cuDevice));
cuDevice)); printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice, attributeVal);
printf("Device %d VIRTUAL ADDRESS MANAGEMENT SUPPORTED = %d.\n", cuDevice, if (attributeVal == 0) {
attributeVal); printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice);
if (attributeVal == 0) { exit(EXIT_WAIVED);
printf("Device %d doesn't support VIRTUAL ADDRESS MANAGEMENT.\n", cuDevice); }
exit(EXIT_WAIVED);
}
// The vector addition happens on cuDevice, so the allocations need to be // The vector addition happens on cuDevice, so the allocations need to be
// mapped there. // mapped there.
vector<CUdevice> mappingDevices; vector<CUdevice> mappingDevices;
mappingDevices.push_back(cuDevice); mappingDevices.push_back(cuDevice);
// Collect devices accessible by the mapping device (cuDevice) into the // Collect devices accessible by the mapping device (cuDevice) into the
// backingDevices vector. // backingDevices vector.
vector<CUdevice> backingDevices = getBackingDevices(cuDevice); vector<CUdevice> backingDevices = getBackingDevices(cuDevice);
// Create context // Create context
checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice)); checkCudaErrors(cuCtxCreate(&cuContext, 0, cuDevice));
// first search for the module path before we load the results // first search for the module path before we load the results
string module_path; string module_path;
std::ostringstream fatbin; std::ostringstream fatbin;
if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) if (!findFatbinPath(FATBIN_FILE, module_path, argv, fatbin)) {
{
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
else else {
{
printf("> initCUDA loading module: <%s>\n", module_path.c_str()); printf("> initCUDA loading module: <%s>\n", module_path.c_str());
} }
if (!fatbin.str().size()) if (!fatbin.str().size()) {
{
printf("fatbin file empty. exiting..\n"); printf("fatbin file empty. exiting..\n");
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
} }
@ -204,13 +200,10 @@ int main(int argc, char **argv) {
int threadsPerBlock = 256; int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
void *args[] = { &d_A, &d_B, &d_C, &N }; void *args[] = {&d_A, &d_B, &d_C, &N};
// Launch the CUDA kernel // Launch the CUDA kernel
checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, checkCudaErrors(cuLaunchKernel(vecAdd_kernel, blocksPerGrid, 1, 1, threadsPerBlock, 1, 1, 0, NULL, args, NULL));
threadsPerBlock, 1, 1,
0,
NULL, args, NULL));
// Copy result from device memory to host memory // Copy result from device memory to host memory
// h_C contains the result in host memory // h_C contains the result in host memory
@ -219,20 +212,18 @@ int main(int argc, char **argv) {
// Verify result // Verify result
int i; int i;
for (i = 0; i < N; ++i) for (i = 0; i < N; ++i) {
{
float sum = h_A[i] + h_B[i]; float sum = h_A[i] + h_B[i];
if (fabs(h_C[i] - sum) > 1e-7f) if (fabs(h_C[i] - sum) > 1e-7f) {
{
break; break;
} }
} }
CleanupNoFailure(); CleanupNoFailure();
printf("%s\n", (i==N) ? "Result = PASS" : "Result = FAIL"); printf("%s\n", (i == N) ? "Result = PASS" : "Result = FAIL");
exit((i==N) ? EXIT_SUCCESS : EXIT_FAILURE); exit((i == N) ? EXIT_SUCCESS : EXIT_FAILURE);
} }
int CleanupNoFailure() int CleanupNoFailure()
@ -243,18 +234,15 @@ int CleanupNoFailure()
checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize)); checkCudaErrors(simpleFreeMultiDeviceMmap(d_C, allocationSize));
// Free host memory // Free host memory
if (h_A) if (h_A) {
{
free(h_A); free(h_A);
} }
if (h_B) if (h_B) {
{
free(h_B); free(h_B);
} }
if (h_C) if (h_C) {
{
free(h_C); free(h_C);
} }
@ -265,8 +253,7 @@ int CleanupNoFailure()
// Allocates an array with random float entries. // Allocates an array with random float entries.
void RandomInit(float *data, int n) void RandomInit(float *data, int n)
{ {
for (int i = 0; i < n; ++i) for (int i = 0; i < n; ++i) {
{
data[i] = rand() / (float)RAND_MAX; data[i] = rand() / (float)RAND_MAX;
} }
} }
@ -34,9 +34,10 @@
*/ */
// Device code // Device code
extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, extern "C" __global__ void VecAdd_kernel(const float *A, const float *B, float *C, int N)
float *C, int N) { {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) C[i] = A[i] + B[i]; if (i < N)
C[i] = A[i] + B[i];
} }
@@ -33,8 +33,8 @@
 * of the programming guide with some additions like error checking.
 */
#include <cmath>
#include <stdio.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda.h>
@@ -42,112 +42,116 @@
// helper functions and utilities to work with CUDA
#include <helper_functions.h>
#include <nvrtc_helper.h>
/**
 * Host main routine
 */
int main(int argc, char **argv)
{
    char *cubin, *kernel_file;
    size_t cubinSize;
    kernel_file = sdkFindFilePath("vectorAdd_kernel.cu", argv[0]);
    compileFileToCUBIN(kernel_file, argc, argv, &cubin, &cubinSize, 0);
    CUmodule module = loadCUBIN(cubin, argc, argv);
    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "vectorAdd"));
    // Print the vector length to be used, and compute its size
    int numElements = 50000;
    size_t size = numElements * sizeof(float);
    printf("[Vector addition of %d elements]\n", numElements);
    // Allocate the host input vector A
    float *h_A = reinterpret_cast<float *>(malloc(size));
    // Allocate the host input vector B
    float *h_B = reinterpret_cast<float *>(malloc(size));
    // Allocate the host output vector C
    float *h_C = reinterpret_cast<float *>(malloc(size));
    // Verify that allocations succeeded
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }
    // Initialize the host input vectors
    for (int i = 0; i < numElements; ++i) {
        h_A[i] = rand() / static_cast<float>(RAND_MAX);
        h_B[i] = rand() / static_cast<float>(RAND_MAX);
    }
    // Allocate the device input vector A
    CUdeviceptr d_A;
    checkCudaErrors(cuMemAlloc(&d_A, size));
    // Allocate the device input vector B
    CUdeviceptr d_B;
    checkCudaErrors(cuMemAlloc(&d_B, size));
    // Allocate the device output vector C
    CUdeviceptr d_C;
    checkCudaErrors(cuMemAlloc(&d_C, size));
    // Copy the host input vectors A and B in host memory to the device input
    // vectors in device memory
    printf("Copy input data from the host memory to the CUDA device\n");
    checkCudaErrors(cuMemcpyHtoD(d_A, h_A, size));
    checkCudaErrors(cuMemcpyHtoD(d_B, h_B, size));
    // Launch the Vector Add CUDA Kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
    printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
    dim3 cudaBlockSize(threadsPerBlock, 1, 1);
    dim3 cudaGridSize(blocksPerGrid, 1, 1);
    void *arr[] = {reinterpret_cast<void *>(&d_A),
                   reinterpret_cast<void *>(&d_B),
                   reinterpret_cast<void *>(&d_C),
                   reinterpret_cast<void *>(&numElements)};
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                   cudaGridSize.x,
                                   cudaGridSize.y,
                                   cudaGridSize.z, /* grid dim */
                                   cudaBlockSize.x,
                                   cudaBlockSize.y,
                                   cudaBlockSize.z, /* block dim */
                                   0,
                                   0, /* shared mem, stream */
                                   &arr[0], /* arguments */
                                   0));
    checkCudaErrors(cuCtxSynchronize());
    // Copy the device result vector in device memory to the host result vector
    // in host memory.
    printf("Copy output data from the CUDA device to the host memory\n");
    checkCudaErrors(cuMemcpyDtoH(h_C, d_C, size));
    // Verify that the result vector is correct
    for (int i = 0; i < numElements; ++i) {
        if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {
            fprintf(stderr, "Result verification failed at element %d!\n", i);
            exit(EXIT_FAILURE);
        }
    }
    printf("Test PASSED\n");
    // Free device global memory
    checkCudaErrors(cuMemFree(d_A));
    checkCudaErrors(cuMemFree(d_B));
    checkCudaErrors(cuMemFree(d_C));
    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);
    printf("Done\n");
    return 0;
}
@@ -32,11 +32,11 @@
 * number of elements numElements.
 */
extern "C" __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < numElements) {
        C[i] = A[i] + B[i];
    }
}
File diff suppressed because it is too large.
@@ -32,12 +32,11 @@
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <iostream>
#include <memory>
#include <string>
int *pArgc = NULL;
char **pArgv = NULL;
#if CUDART_VERSION < 5000
@@ -46,19 +45,16 @@ char **pArgv = NULL;
#include <cuda.h>
// This function wraps the CUDA Driver API into a template function
template <class T> inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
    if (CUDA_SUCCESS != error) {
        fprintf(
            stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n", error, __FILE__, __LINE__);
        exit(EXIT_FAILURE);
    }
}
#endif /* CUDART_VERSION < 5000 */
@@ -66,278 +62,259 @@ inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    pArgc = &argc;
    pArgv = argv;
    printf("%s Starting...\n\n", argv[0]);
    printf(" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
    if (error_id != cudaSuccess) {
        printf("cudaGetDeviceCount returned %d\n-> %s\n", static_cast<int>(error_id), cudaGetErrorString(error_id));
        printf("Result = FAIL\n");
        exit(EXIT_FAILURE);
    }
    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0) {
        printf("There are no available device(s) that support CUDA\n");
    }
    else {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }
    int dev, driverVersion = 0, runtimeVersion = 0;
    for (dev = 0; dev < deviceCount; ++dev) {
        cudaSetDevice(dev);
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, dev);
        printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
        // Console log
        cudaDriverGetVersion(&driverVersion);
        cudaRuntimeGetVersion(&runtimeVersion);
        printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
               driverVersion / 1000,
               (driverVersion % 100) / 10,
               runtimeVersion / 1000,
               (runtimeVersion % 100) / 10);
        printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);
        char msg[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        sprintf_s(msg,
                  sizeof(msg),
                  " Total amount of global memory: %.0f MBytes "
                  "(%llu bytes)\n",
                  static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
                  (unsigned long long)deviceProp.totalGlobalMem);
#else
        snprintf(msg,
                 sizeof(msg),
                 " Total amount of global memory: %.0f MBytes "
                 "(%llu bytes)\n",
                 static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
                 (unsigned long long)deviceProp.totalGlobalMem);
#endif
        printf("%s", msg);
        printf(" (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n",
               deviceProp.multiProcessorCount,
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
               _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
        printf(" GPU Max Clock rate: %.0f MHz (%0.2f "
               "GHz)\n",
               deviceProp.clockRate * 1e-3f,
               deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 5000
        // This is supported in CUDA 5.0 (runtime API device properties)
        printf(" Memory Clock rate: %.0f Mhz\n", deviceProp.memoryClockRate * 1e-3f);
        printf(" Memory Bus Width: %d-bit\n", deviceProp.memoryBusWidth);
        if (deviceProp.l2CacheSize) {
            printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
        }
#else
        // This only available in CUDA 4.0-4.2 (but these were only exposed in the
        // CUDA Driver API)
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf(" Memory Bus Width: %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
        if (L2CacheSize) {
            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
        }
#endif
        printf(" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, "
               "%d), 3D=(%d, %d, %d)\n",
               deviceProp.maxTexture1D,
               deviceProp.maxTexture2D[0],
               deviceProp.maxTexture2D[1],
               deviceProp.maxTexture3D[0],
               deviceProp.maxTexture3D[1],
               deviceProp.maxTexture3D[2]);
        printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
               deviceProp.maxTexture1DLayered[0],
               deviceProp.maxTexture1DLayered[1]);
        printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
               "layers\n",
               deviceProp.maxTexture2DLayered[0],
               deviceProp.maxTexture2DLayered[1],
               deviceProp.maxTexture2DLayered[2]);
        printf(" Total amount of constant memory: %zu bytes\n", deviceProp.totalConstMem);
        printf(" Total amount of shared memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock);
        printf(" Total shared memory per multiprocessor: %zu bytes\n", deviceProp.sharedMemPerMultiprocessor);
        printf(" Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
        printf(" Warp size: %d\n", deviceProp.warpSize);
        printf(" Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
        printf(" Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxThreadsDim[0],
               deviceProp.maxThreadsDim[1],
               deviceProp.maxThreadsDim[2]);
        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
               deviceProp.maxGridSize[0],
               deviceProp.maxGridSize[1],
               deviceProp.maxGridSize[2]);
        printf(" Maximum memory pitch: %zu bytes\n", deviceProp.memPitch);
        printf(" Texture alignment: %zu bytes\n", deviceProp.textureAlignment);
        printf(" Concurrent copy and kernel execution: %s with %d copy "
               "engine(s)\n",
               (deviceProp.deviceOverlap ? "Yes" : "No"),
               deviceProp.asyncEngineCount);
        printf(" Run time limit on kernels: %s\n",
               deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
        printf(" Integrated GPU sharing Host Memory: %s\n", deviceProp.integrated ? "Yes" : "No");
        printf(" Support host page-locked memory mapping: %s\n", deviceProp.canMapHostMemory ? "Yes" : "No");
        printf(" Alignment requirement for Surfaces: %s\n", deviceProp.surfaceAlignment ? "Yes" : "No");
        printf(" Device has ECC support: %s\n", deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
               deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
        printf(" Device supports Unified Addressing (UVA): %s\n", deviceProp.unifiedAddressing ? "Yes" : "No");
        printf(" Device supports Managed Memory: %s\n", deviceProp.managedMemory ? "Yes" : "No");
        printf(" Device supports Compute Preemption: %s\n",
               deviceProp.computePreemptionSupported ? "Yes" : "No");
        printf(" Supports Cooperative Kernel Launch: %s\n", deviceProp.cooperativeLaunch ? "Yes" : "No");
        printf(" Supports MultiDevice Co-op Kernel Launch: %s\n",
               deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
        printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n",
               deviceProp.pciDomainID,
               deviceProp.pciBusID,
               deviceProp.pciDeviceID);
        const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with device "
                                      "simultaneously)",
                                      "Exclusive (only one host thread in one process is able to use "
                                      "::cudaSetDevice() with this device)",
                                      "Prohibited (no host thread can use ::cudaSetDevice() with this "
                                      "device)",
                                      "Exclusive Process (many threads in one process is able to use "
                                      "::cudaSetDevice() with this device)",
                                      "Unknown",
                                      NULL};
        printf(" Compute Mode:\n");
        printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
    }
    // If there are 2 or more GPUs, query to determine whether RDMA is supported
    if (deviceCount >= 2) {
        cudaDeviceProp prop[64];
        int gpuid[64]; // we want to find the first two GPUs that can support P2P
        int gpu_p2p_count = 0;
        for (int i = 0; i < deviceCount; i++) {
            checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
            // Only boards based on Fermi or later can support P2P
            if ((prop[i].major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
                // on Windows (64-bit), the Tesla Compute Cluster driver for windows
                // must be enabled to support this
                && prop[i].tccDriver
#endif
            ) {
                // This is an array of P2P capable GPUs
                gpuid[gpu_p2p_count++] = i;
            }
        }
        // Show all the combinations of support P2P GPUs
        int can_access_peer;
        if (gpu_p2p_count >= 2) {
            for (int i = 0; i < gpu_p2p_count; i++) {
                for (int j = 0; j < gpu_p2p_count; j++) {
                    if (gpuid[i] == gpuid[j]) {
                        continue;
                    }
                    checkCudaErrors(cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
                    printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
                           prop[gpuid[i]].name,
                           gpuid[i],
                           prop[gpuid[j]].name,
                           gpuid[j],
                           can_access_peer ? "Yes" : "No");
                }
            }
        }
    }
    // csv masterlog info
    // *****************************
    // exe and CUDA driver name
    printf("\n");
    std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
    char cTemp[16];
    // driver version
    sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
#else
    snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000, (driverVersion % 100) / 10);
#endif
    sProfileString += cTemp;
    // Runtime version
    sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
#else
    snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000, (runtimeVersion % 100) / 10);
#endif
    sProfileString += cTemp;
    // Device count
    sProfileString += ", NumDevs = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    sprintf_s(cTemp, 10, "%d", deviceCount);
#else
    snprintf(cTemp, sizeof(cTemp), "%d", deviceCount);
#endif
    sProfileString += cTemp;
    sProfileString += "\n";
    printf("%s", sProfileString.c_str());
    printf("Result = PASS\n");
    // finish
    exit(EXIT_SUCCESS);
}
@@ -30,358 +30,295 @@
 */
// includes, system
#include <cuda.h>
#include <helper_cuda_drvapi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
    CUdevice dev;
    int major = 0, minor = 0;
    int deviceCount = 0;
    char deviceName[256];
    printf("%s Starting...\n\n", argv[0]);
    // note your project will need to link with cuda.lib files on windows
    printf("CUDA Device Query (Driver API) statically linked version \n");
    checkCudaErrors(cuInit(0));
    checkCudaErrors(cuDeviceGetCount(&deviceCount));
    // This function call returns 0 if there are no CUDA capable devices.
    if (deviceCount == 0) {
        printf("There are no available device(s) that support CUDA\n");
    }
    else {
        printf("Detected %d CUDA Capable device(s)\n", deviceCount);
    }
    for (dev = 0; dev < deviceCount; ++dev) {
        checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
        checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
        checkCudaErrors(cuDeviceGetName(deviceName, 256, dev));
        printf("\nDevice %d: \"%s\"\n", dev, deviceName);
        int driverVersion = 0;
        checkCudaErrors(cuDriverGetVersion(&driverVersion));
        printf(" CUDA Driver Version: %d.%d\n",
               driverVersion / 1000,
               (driverVersion % 100) / 10);
        printf(" CUDA Capability Major/Minor version number: %d.%d\n", major, minor);
        size_t totalGlobalMem;
        checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev));
        char msg[256];
        SPRINTF(msg,
                " Total amount of global memory: %.0f MBytes "
                "(%llu bytes)\n",
                (float)totalGlobalMem / 1048576.0f,
                (unsigned long long)totalGlobalMem);
        printf("%s", msg);
        int multiProcessorCount;
        getCudaAttribute<int>(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
        printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
               multiProcessorCount,
               _ConvertSMVer2CoresDRV(major, minor),
               _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);
        int clockRate;
        getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
        printf(" GPU Max Clock rate: %.0f MHz (%0.2f "
               "GHz)\n",
               clockRate * 1e-3f,
               clockRate * 1e-6f);
        int memoryClock;
        getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev);
        printf(" Memory Clock rate: %.0f Mhz\n", memoryClock * 1e-3f);
        int memBusWidth;
        getCudaAttribute<int>(&memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
        printf(" Memory Bus Width: %d-bit\n", memBusWidth);
        int L2CacheSize;
        getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
        if (L2CacheSize) {
            printf(" L2 Cache Size: %d bytes\n", L2CacheSize);
        }
        int maxTex1D, maxTex2D[2], maxTex3D[3];
        getCudaAttribute<int>(&maxTex1D, CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex3D[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
        getCudaAttribute<int>(&maxTex3D[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex3D[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
        printf(" Max Texture Dimension Sizes 1D=(%d) 2D=(%d, %d) "
               "3D=(%d, %d, %d)\n",
               maxTex1D,
               maxTex2D[0],
               maxTex2D[1],
               maxTex3D[0],
               maxTex3D[1],
               maxTex3D[2]);
        int maxTex1DLayered[2];
        getCudaAttribute<int>(&maxTex1DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH, dev);
        getCudaAttribute<int>(&maxTex1DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS, dev);
        printf(" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
               maxTex1DLayered[0],
               maxTex1DLayered[1]);
        int maxTex2DLayered[3];
        getCudaAttribute<int>(&maxTex2DLayered[0], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH, dev);
        getCudaAttribute<int>(&maxTex2DLayered[1], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT, dev);
        getCudaAttribute<int>(&maxTex2DLayered[2], CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS, dev);
        printf(" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
               "layers\n",
               maxTex2DLayered[0],
               maxTex2DLayered[1],
               maxTex2DLayered[2]);
        int totalConstantMemory;
        getCudaAttribute<int>(&totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
        printf(" Total amount of constant memory: %u bytes\n", totalConstantMemory);
        int sharedMemPerBlock;
        getCudaAttribute<int>(&sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
        printf(" Total amount of shared memory per block: %u bytes\n", sharedMemPerBlock);
        int regsPerBlock;
        getCudaAttribute<int>(&regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
        printf(" Total number of registers available per block: %d\n", regsPerBlock);
        int warpSize;
        getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
        printf(" Warp size: %d\n", warpSize);
        int maxThreadsPerMultiProcessor;
        getCudaAttribute<int>(&maxThreadsPerMultiProcessor, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev);
        printf(" Maximum number of threads per multiprocessor: %d\n", maxThreadsPerMultiProcessor);
        int maxThreadsPerBlock;
        getCudaAttribute<int>(&maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
        printf(" Maximum number of threads per block: %d\n", maxThreadsPerBlock);
        int blockDim[3];
        getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev);
        getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev);
        getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev);
        printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n", blockDim[0], blockDim[1], blockDim[2]);
        int gridDim[3];
        getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
        getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
        getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
        printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n", gridDim[0], gridDim[1], gridDim[2]);
        int textureAlign;
        getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev);
        printf(" Texture alignment: %u bytes\n", textureAlign);
        int memPitch;
        getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
        printf(" Maximum memory pitch: %u bytes\n", memPitch);
        int gpuOverlap;
        getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
        int asyncEngineCount;
        getCudaAttribute<int>(&asyncEngineCount, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
        printf(" Concurrent copy and kernel execution: %s with %d copy "
               "engine(s)\n",
               (gpuOverlap ? "Yes" : "No"),
               asyncEngineCount);
        int kernelExecTimeoutEnabled;
        getCudaAttribute<int>(&kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
        printf(" Run time limit on kernels: %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");
        int integrated;
        getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
        printf(" Integrated GPU sharing Host Memory: %s\n", integrated ? "Yes" : "No");
        int canMapHostMemory;
        getCudaAttribute<int>(&canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
        printf(" Support host page-locked memory mapping: %s\n", canMapHostMemory ? "Yes" : "No");
        int concurrentKernels;
        getCudaAttribute<int>(&concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
        printf(" Concurrent kernel execution: %s\n", concurrentKernels ? "Yes" : "No");
        int surfaceAlignment;
        getCudaAttribute<int>(&surfaceAlignment, CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
        printf(" Alignment requirement for Surfaces: %s\n", surfaceAlignment ? "Yes" : "No");
        int eccEnabled;
        getCudaAttribute<int>(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
        printf(" Device has ECC support: %s\n", eccEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
        int tccDriver;
        getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);
        printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
               tccDriver ? "TCC (Tesla Compute Cluster Driver)" : "WDDM (Windows Display Driver Model)");
#endif
        int unifiedAddressing;
        getCudaAttribute<int>(&unifiedAddressing, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
        printf(" Device supports Unified Addressing (UVA): %s\n", unifiedAddressing ? "Yes" : "No");
        int managedMemory;
        getCudaAttribute<int>(&managedMemory, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, dev);
        printf(" Device supports Managed Memory: %s\n", managedMemory ? "Yes" : "No");
        int computePreemption;
        getCudaAttribute<int>(&computePreemption, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, dev);
        printf(" Device supports Compute Preemption: %s\n", computePreemption ? "Yes" : "No");
        int cooperativeLaunch;
        getCudaAttribute<int>(&cooperativeLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
        printf(" Supports Cooperative Kernel Launch: %s\n", cooperativeLaunch ? "Yes" : "No");
        int cooperativeMultiDevLaunch;
        getCudaAttribute<int>(&cooperativeMultiDevLaunch, CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH, dev);
        printf(" Supports MultiDevice Co-op Kernel Launch: %s\n", cooperativeMultiDevLaunch ? "Yes" : "No");
        int pciDomainID, pciBusID, pciDeviceID;
        getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);
        getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
        getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
        printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n", pciDomainID, pciBusID, pciDeviceID);
        const char *sComputeMode[] = {"Default (multiple host threads can use ::cudaSetDevice() with device "
                                      "simultaneously)",
                                      "Exclusive (only one host thread in one process is able to use "
                                      "::cudaSetDevice() with this device)",
                                      "Prohibited (no host thread can use ::cudaSetDevice() with this "
                                      "device)",
                                      "Exclusive Process (many threads in one process is able to use "
                                      "::cudaSetDevice() with this device)",
                                      "Unknown",
                                      NULL};
        int computeMode;
        getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
        printf(" Compute Mode:\n");
        printf(" < %s >\n", sComputeMode[computeMode]);
    }
    // If there are 2 or more GPUs, query to determine whether RDMA is supported
    if (deviceCount >= 2) {
        int gpuid[64]; // we want to find the first two GPUs that can support P2P
        int gpu_p2p_count = 0;
        int tccDriver = 0;
        for (int i = 0; i < deviceCount; i++) {
            checkCudaErrors(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i));
            checkCudaErrors(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i));
            getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);
            // Only boards based on Fermi or later can support P2P
            if ((major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
                // on Windows (64-bit), the Tesla Compute Cluster driver for windows
                // must be enabled to support this
                && tccDriver
#endif
            ) {
                // This is an array of P2P capable GPUs
                gpuid[gpu_p2p_count++] = i;
            }
        }
        // Show all the combinations of support P2P GPUs
        int can_access_peer;
        char deviceName0[256], deviceName1[256];
        if (gpu_p2p_count >= 2) {
            for (int i = 0; i < gpu_p2p_count; i++) {
                for (int j = 0; j < gpu_p2p_count; j++) {
                    if (gpuid[i] == gpuid[j]) {
                        continue;
                    }
                    checkCudaErrors(cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
                    checkCudaErrors(cuDeviceGetName(deviceName0, 256, gpuid[i]));
                    checkCudaErrors(cuDeviceGetName(deviceName1, 256, gpuid[j]));
                    printf("> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : "
                           "%s\n",
                           deviceName0,
                           gpuid[i],
                           deviceName1,
                           gpuid[j],
                           can_access_peer ? "Yes" : "No");
                }
            }
        }
    }
    printf("Result = PASS\n");
    exit(EXIT_SUCCESS);
}
@@ -30,4 +30,3 @@ cudaGetDeviceCount, cudaDeviceGetAttribute
Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-downloads) for your corresponding platform.
## References (for more details)
@@ -35,48 +35,44 @@
// includes, project
#include <helper_cuda.h>
#include <helper_functions.h> // helper for shared that are common to CUDA Samples
int main(int argc, char **argv)
{
    int deviceCount = 0;
    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
    // Enumerates Device <-> Device links
    for (int device1 = 0; device1 < deviceCount; device1++) {
        for (int device2 = 0; device2 < deviceCount; device2++) {
            if (device1 == device2)
                continue;
            int perfRank = 0;
            int atomicSupported = 0;
            int accessSupported = 0;
            checkCudaErrors(
                cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, device1, device2));
            checkCudaErrors(cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, device1, device2));
            checkCudaErrors(
                cudaDeviceGetP2PAttribute(&atomicSupported, cudaDevP2PAttrNativeAtomicSupported, device1, device2));
            if (accessSupported) {
                std::cout << "GPU" << device1 << " <-> GPU" << device2 << ":" << std::endl;
                std::cout << " * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
                std::cout << " * Perf Rank: " << perfRank << std::endl;
            }
        }
    }
    // Enumerates Device <-> Host links
    for (int device = 0; device < deviceCount; device++) {
        int atomicSupported = 0;
        checkCudaErrors(cudaDeviceGetAttribute(&atomicSupported, cudaDevAttrHostNativeAtomicSupported, device));
        std::cout << "GPU" << device << " <-> CPU:" << std::endl;
        std::cout << " * Atomic Supported: " << (atomicSupported ? "yes" : "no") << std::endl;
    }
    return 0;
}
@@ -37,4 +37,3 @@ Download and install the [CUDA Toolkit 12.5](https://developer.nvidia.com/cuda-d
Make sure the dependencies mentioned in [Dependencies]() section above are installed.
## References (for more details)
Some files were not shown because too many files have changed in this diff.